From 8ccf0e9fb530b7c2f93616f7c794167e8748a4af Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 6 May 2020 18:42:47 -0500 Subject: [PATCH 01/41] New C++ compute kernels development framework project --- cpp/CMakeLists.txt | 4 + cpp/src/arrow/CMakeLists.txt | 40 +- cpp/src/arrow/array/diff_test.cc | 13 +- cpp/src/arrow/compute/CMakeLists.txt | 9 +- cpp/src/arrow/compute/README.md | 58 + cpp/src/arrow/compute/api.h | 23 +- cpp/src/arrow/compute/api_eager.cc | 209 +++ cpp/src/arrow/compute/api_eager.h | 341 ++++ cpp/src/arrow/compute/cast.h | 68 + cpp/src/arrow/compute/compute_test.cc | 95 - cpp/src/arrow/compute/context.h | 79 - cpp/src/arrow/compute/exec.cc | 859 +++++++++ cpp/src/arrow/compute/exec.h | 175 ++ cpp/src/arrow/compute/exec_internal.h | 128 ++ cpp/src/arrow/compute/exec_test.cc | 840 +++++++++ cpp/src/arrow/compute/filter.h | 54 + cpp/src/arrow/compute/function.cc | 150 ++ cpp/src/arrow/compute/function.h | 197 +++ cpp/src/arrow/compute/function_test.cc | 239 +++ cpp/src/arrow/compute/kernel.cc | 296 ++++ cpp/src/arrow/compute/kernel.h | 655 ++++--- cpp/src/arrow/compute/kernel_test.cc | 430 +++++ cpp/src/arrow/compute/kernels/CMakeLists.txt | 66 +- cpp/src/arrow/compute/kernels/add.cc | 131 -- cpp/src/arrow/compute/kernels/add.h | 77 - cpp/src/arrow/compute/kernels/aggregate.cc | 88 - cpp/src/arrow/compute/kernels/aggregate.h | 115 -- .../arrow/compute/kernels/aggregate_basic.cc | 366 ++++ .../compute/kernels/aggregate_benchmark.cc | 9 +- .../kernels/{isin.h => aggregate_internal.h} | 43 +- .../arrow/compute/kernels/aggregate_test.cc | 292 +--- cpp/src/arrow/compute/kernels/boolean.cc | 269 --- cpp/src/arrow/compute/kernels/boolean.h | 105 -- cpp/src/arrow/compute/kernels/cast.cc | 1549 ----------------- cpp/src/arrow/compute/kernels/cast.h | 101 -- .../arrow/compute/kernels/codegen_internal.cc | 145 ++ .../arrow/compute/kernels/codegen_internal.h | 429 +++++ cpp/src/arrow/compute/kernels/common.h | 50 + cpp/src/arrow/compute/kernels/compare.cc | 
332 ---- cpp/src/arrow/compute/kernels/compare.h | 72 - cpp/src/arrow/compute/kernels/count.cc | 115 -- cpp/src/arrow/compute/kernels/count.h | 88 - cpp/src/arrow/compute/kernels/filter.h | 105 -- cpp/src/arrow/compute/kernels/hash.h | 102 -- cpp/src/arrow/compute/kernels/isin.cc | 306 ---- cpp/src/arrow/compute/kernels/isin_test.cc | 415 ----- cpp/src/arrow/compute/kernels/match.cc | 281 --- cpp/src/arrow/compute/kernels/match.h | 57 - cpp/src/arrow/compute/kernels/match_test.cc | 389 ----- cpp/src/arrow/compute/kernels/mean.cc | 116 -- cpp/src/arrow/compute/kernels/mean.h | 66 - cpp/src/arrow/compute/kernels/minmax.cc | 142 +- cpp/src/arrow/compute/kernels/minmax.h | 98 -- .../arrow/compute/kernels/nth_to_indices.cc | 140 -- .../arrow/compute/kernels/nth_to_indices.h | 53 - .../kernels/{sort_to_indices.h => registry.h} | 41 +- .../scalar_arithmetic.cc} | 43 +- ...{add_test.cc => scalar_arithmetic_test.cc} | 21 +- .../arrow/compute/kernels/scalar_boolean.cc | 183 ++ ...boolean_test.cc => scalar_boolean_test.cc} | 46 +- cpp/src/arrow/compute/kernels/scalar_cast.cc | 449 +++++ .../compute/kernels/scalar_cast_boolean.cc | 82 + .../compute/kernels/scalar_cast_decimal.cc | 88 + .../compute/kernels/scalar_cast_internal.h | 222 +++ .../compute/kernels/scalar_cast_nested.cc | 64 + .../compute/kernels/scalar_cast_numeric.cc | 425 +++++ .../compute/kernels/scalar_cast_string.cc | 110 ++ .../compute/kernels/scalar_cast_temporal.cc | 276 +++ .../{cast_test.cc => scalar_cast_test.cc} | 88 +- .../arrow/compute/kernels/scalar_compare.cc | 117 ++ ...nchmark.cc => scalar_compare_benchmark.cc} | 12 +- ...compare_test.cc => scalar_compare_test.cc} | 330 ++-- .../compute/kernels/scalar_set_lookup.cc | 317 ++++ .../compute/kernels/scalar_set_lookup_test.cc | 677 +++++++ cpp/src/arrow/compute/kernels/sum.cc | 114 -- cpp/src/arrow/compute/kernels/sum.h | 71 - cpp/src/arrow/compute/kernels/sum_internal.h | 207 --- .../arrow/compute/kernels/util_internal.cc | 337 ---- 
cpp/src/arrow/compute/kernels/util_internal.h | 154 -- .../compute/kernels/util_internal_test.cc | 247 --- .../kernels/{filter.cc => vector_filter.cc} | 154 +- ...enchmark.cc => vector_filter_benchmark.cc} | 20 +- .../{filter_test.cc => vector_filter_test.cc} | 116 +- .../kernels/{hash.cc => vector_hash.cc} | 76 +- .../vector_hash_benchmark.cc} | 21 +- .../{hash_test.cc => vector_hash_test.cc} | 241 ++- .../arrow/compute/kernels/vector_partition.cc | 107 ++ ...hmark.cc => vector_partition_benchmark.cc} | 7 +- ...dices_test.cc => vector_partition_test.cc} | 78 +- .../{sort_to_indices.cc => vector_sort.cc} | 4 +- ..._benchmark.cc => vector_sort_benchmark.cc} | 8 +- ...to_indices_test.cc => vector_sort_test.cc} | 23 +- .../kernels/{take.cc => vector_take.cc} | 30 +- ..._benchmark.cc => vector_take_benchmark.cc} | 8 +- ...take_internal.h => vector_take_internal.h} | 2 +- .../{take_test.cc => vector_take_test.cc} | 49 +- cpp/src/arrow/compute/options.h | 155 ++ cpp/src/arrow/compute/registry.cc | 124 ++ cpp/src/arrow/compute/registry.h | 74 + cpp/src/arrow/compute/registry_test.cc | 89 + cpp/src/arrow/compute/{kernels => }/take.h | 118 +- cpp/src/arrow/compute/test_util.h | 24 +- cpp/src/arrow/dataset/filter.cc | 86 +- cpp/src/arrow/dataset/filter.h | 81 +- cpp/src/arrow/dataset/filter_test.cc | 21 +- cpp/src/arrow/dataset/scanner.h | 1 - cpp/src/arrow/dataset/scanner_internal.h | 2 +- cpp/src/arrow/dataset/scanner_test.cc | 1 - cpp/src/arrow/dataset/type_fwd.h | 2 +- cpp/src/arrow/datum.cc | 188 ++ cpp/src/arrow/datum.h | 270 +++ cpp/src/arrow/datum_test.cc | 161 ++ cpp/src/arrow/python/arrow_to_pandas.cc | 26 +- cpp/src/arrow/python/numpy_to_arrow.cc | 22 +- cpp/src/arrow/python/numpy_to_arrow.h | 2 +- cpp/src/arrow/stl.h | 14 +- cpp/src/arrow/stl_test.cc | 6 +- cpp/src/arrow/testing/gtest_util.cc | 16 +- cpp/src/arrow/testing/gtest_util.h | 8 +- cpp/src/arrow/type.cc | 86 + cpp/src/arrow/type.h | 10 + cpp/src/arrow/type_fwd.h | 1 + cpp/src/gandiva/arrow.h | 27 +- 
.../parquet/arrow/arrow_reader_writer_test.cc | 16 +- cpp/src/parquet/arrow/reader_internal.cc | 4 +- cpp/src/parquet/column_writer.cc | 12 +- cpp/src/parquet/encoding_test.cc | 1 - testing | 2 +- 128 files changed, 10719 insertions(+), 8270 deletions(-) create mode 100644 cpp/src/arrow/compute/README.md create mode 100644 cpp/src/arrow/compute/api_eager.cc create mode 100644 cpp/src/arrow/compute/api_eager.h create mode 100644 cpp/src/arrow/compute/cast.h delete mode 100644 cpp/src/arrow/compute/compute_test.cc delete mode 100644 cpp/src/arrow/compute/context.h create mode 100644 cpp/src/arrow/compute/exec.cc create mode 100644 cpp/src/arrow/compute/exec.h create mode 100644 cpp/src/arrow/compute/exec_internal.h create mode 100644 cpp/src/arrow/compute/exec_test.cc create mode 100644 cpp/src/arrow/compute/filter.h create mode 100644 cpp/src/arrow/compute/function.cc create mode 100644 cpp/src/arrow/compute/function.h create mode 100644 cpp/src/arrow/compute/function_test.cc create mode 100644 cpp/src/arrow/compute/kernel.cc create mode 100644 cpp/src/arrow/compute/kernel_test.cc delete mode 100644 cpp/src/arrow/compute/kernels/add.cc delete mode 100644 cpp/src/arrow/compute/kernels/add.h delete mode 100644 cpp/src/arrow/compute/kernels/aggregate.cc delete mode 100644 cpp/src/arrow/compute/kernels/aggregate.h create mode 100644 cpp/src/arrow/compute/kernels/aggregate_basic.cc rename cpp/src/arrow/compute/kernels/{isin.h => aggregate_internal.h} (53%) delete mode 100644 cpp/src/arrow/compute/kernels/boolean.cc delete mode 100644 cpp/src/arrow/compute/kernels/boolean.h delete mode 100644 cpp/src/arrow/compute/kernels/cast.cc delete mode 100644 cpp/src/arrow/compute/kernels/cast.h create mode 100644 cpp/src/arrow/compute/kernels/codegen_internal.cc create mode 100644 cpp/src/arrow/compute/kernels/codegen_internal.h create mode 100644 cpp/src/arrow/compute/kernels/common.h delete mode 100644 cpp/src/arrow/compute/kernels/compare.cc delete mode 100644 
cpp/src/arrow/compute/kernels/compare.h delete mode 100644 cpp/src/arrow/compute/kernels/count.cc delete mode 100644 cpp/src/arrow/compute/kernels/count.h delete mode 100644 cpp/src/arrow/compute/kernels/filter.h delete mode 100644 cpp/src/arrow/compute/kernels/hash.h delete mode 100644 cpp/src/arrow/compute/kernels/isin.cc delete mode 100644 cpp/src/arrow/compute/kernels/isin_test.cc delete mode 100644 cpp/src/arrow/compute/kernels/match.cc delete mode 100644 cpp/src/arrow/compute/kernels/match.h delete mode 100644 cpp/src/arrow/compute/kernels/match_test.cc delete mode 100644 cpp/src/arrow/compute/kernels/mean.cc delete mode 100644 cpp/src/arrow/compute/kernels/mean.h delete mode 100644 cpp/src/arrow/compute/kernels/minmax.h delete mode 100644 cpp/src/arrow/compute/kernels/nth_to_indices.cc delete mode 100644 cpp/src/arrow/compute/kernels/nth_to_indices.h rename cpp/src/arrow/compute/kernels/{sort_to_indices.h => registry.h} (53%) rename cpp/src/arrow/compute/{context.cc => kernels/scalar_arithmetic.cc} (51%) rename cpp/src/arrow/compute/kernels/{add_test.cc => scalar_arithmetic_test.cc} (82%) create mode 100644 cpp/src/arrow/compute/kernels/scalar_boolean.cc rename cpp/src/arrow/compute/kernels/{boolean_test.cc => scalar_boolean_test.cc} (89%) create mode 100644 cpp/src/arrow/compute/kernels/scalar_cast.cc create mode 100644 cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc create mode 100644 cpp/src/arrow/compute/kernels/scalar_cast_decimal.cc create mode 100644 cpp/src/arrow/compute/kernels/scalar_cast_internal.h create mode 100644 cpp/src/arrow/compute/kernels/scalar_cast_nested.cc create mode 100644 cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc create mode 100644 cpp/src/arrow/compute/kernels/scalar_cast_string.cc create mode 100644 cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc rename cpp/src/arrow/compute/kernels/{cast_test.cc => scalar_cast_test.cc} (96%) create mode 100644 cpp/src/arrow/compute/kernels/scalar_compare.cc rename 
cpp/src/arrow/compute/kernels/{compare_benchmark.cc => scalar_compare_benchmark.cc} (89%) rename cpp/src/arrow/compute/kernels/{compare_test.cc => scalar_compare_test.cc} (52%) create mode 100644 cpp/src/arrow/compute/kernels/scalar_set_lookup.cc create mode 100644 cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc delete mode 100644 cpp/src/arrow/compute/kernels/sum.cc delete mode 100644 cpp/src/arrow/compute/kernels/sum.h delete mode 100644 cpp/src/arrow/compute/kernels/sum_internal.h delete mode 100644 cpp/src/arrow/compute/kernels/util_internal.cc delete mode 100644 cpp/src/arrow/compute/kernels/util_internal.h delete mode 100644 cpp/src/arrow/compute/kernels/util_internal_test.cc rename cpp/src/arrow/compute/kernels/{filter.cc => vector_filter.cc} (64%) rename cpp/src/arrow/compute/kernels/{filter_benchmark.cc => vector_filter_benchmark.cc} (86%) rename cpp/src/arrow/compute/kernels/{filter_test.cc => vector_filter_test.cc} (87%) rename cpp/src/arrow/compute/kernels/{hash.cc => vector_hash.cc} (87%) rename cpp/src/arrow/compute/{compute_benchmark.cc => kernels/vector_hash_benchmark.cc} (93%) rename cpp/src/arrow/compute/kernels/{hash_test.cc => vector_hash_test.cc} (68%) create mode 100644 cpp/src/arrow/compute/kernels/vector_partition.cc rename cpp/src/arrow/compute/kernels/{nth_to_indices_benchmark.cc => vector_partition_benchmark.cc} (90%) rename cpp/src/arrow/compute/kernels/{nth_to_indices_test.cc => vector_partition_test.cc} (59%) rename cpp/src/arrow/compute/kernels/{sort_to_indices.cc => vector_sort.cc} (99%) rename cpp/src/arrow/compute/kernels/{sort_to_indices_benchmark.cc => vector_sort_benchmark.cc} (92%) rename cpp/src/arrow/compute/kernels/{sort_to_indices_test.cc => vector_sort_test.cc} (90%) rename cpp/src/arrow/compute/kernels/{take.cc => vector_take.cc} (87%) rename cpp/src/arrow/compute/kernels/{take_benchmark.cc => vector_take_benchmark.cc} (95%) rename cpp/src/arrow/compute/kernels/{take_internal.h => vector_take_internal.h} (99%) 
rename cpp/src/arrow/compute/kernels/{take_test.cc => vector_take_test.cc} (93%) create mode 100644 cpp/src/arrow/compute/options.h create mode 100644 cpp/src/arrow/compute/registry.cc create mode 100644 cpp/src/arrow/compute/registry.h create mode 100644 cpp/src/arrow/compute/registry_test.cc rename cpp/src/arrow/compute/{kernels => }/take.h (60%) create mode 100644 cpp/src/arrow/datum.cc create mode 100644 cpp/src/arrow/datum.h create mode 100644 cpp/src/arrow/datum_test.cc diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c5f65835499..662461ec89f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -309,6 +309,10 @@ if(ARROW_DATASET) set(ARROW_FILESYSTEM ON) endif() +if(ARROW_GANDIVA) + set(ARROW_COMPUTE ON) +endif() + if(ARROW_PARQUET) set(ARROW_COMPUTE ON) endif() diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index b06147f2247..2e62391c442 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -121,7 +121,6 @@ endfunction() set(ARROW_SRCS array.cc - builder.cc array/builder_adaptive.cc array/builder_base.cc array/builder_binary.cc @@ -134,8 +133,10 @@ set(ARROW_SRCS array/dict_internal.cc array/diff.cc array/validate.cc + builder.cc buffer.cc compare.cc + datum.cc device.cc extension_type.cc memory_pool.cc @@ -319,24 +320,24 @@ endif() if(ARROW_COMPUTE) list(APPEND ARROW_SRCS - compute/context.cc - compute/kernels/aggregate.cc - compute/kernels/boolean.cc - compute/kernels/cast.cc - compute/kernels/compare.cc - compute/kernels/count.cc - compute/kernels/hash.cc - compute/kernels/filter.cc - compute/kernels/mean.cc - compute/kernels/minmax.cc - compute/kernels/sort_to_indices.cc - compute/kernels/nth_to_indices.cc - compute/kernels/sum.cc - compute/kernels/add.cc - compute/kernels/take.cc - compute/kernels/isin.cc - compute/kernels/match.cc - compute/kernels/util_internal.cc) + compute/api_eager.cc + compute/exec.cc + compute/function.cc + compute/kernel.cc + compute/registry.cc + 
compute/kernels/codegen_internal.cc + compute/kernels/aggregate_basic.cc + compute/kernels/scalar_arithmetic.cc + compute/kernels/scalar_boolean.cc + compute/kernels/scalar_compare.cc + compute/kernels/scalar_set_lookup.cc + compute/kernels/vector_partition.cc + # compute/kernels/scalar_cast.cc + # compute/kernels/filter.cc + # compute/kernels/take.cc + # compute/kernels/hash.cc + # compute/kernels/sort_to_indices.cc + ) endif() if(ARROW_FILESYSTEM) @@ -524,6 +525,7 @@ endif() add_arrow_test(misc_test SOURCES + datum_test.cc memory_pool_test.cc result_test.cc pretty_print_test.cc diff --git a/cpp/src/arrow/array/diff_test.cc b/cpp/src/arrow/array/diff_test.cc index 0e9ccc40504..4917d4524d1 100644 --- a/cpp/src/arrow/array/diff_test.cc +++ b/cpp/src/arrow/array/diff_test.cc @@ -33,8 +33,7 @@ #include "arrow/array/diff.h" #include "arrow/buffer.h" #include "arrow/builder.h" -#include "arrow/compute/context.h" -#include "arrow/compute/kernels/filter.h" +#include "arrow/compute/api.h" #include "arrow/status.h" #include "arrow/testing/gtest_common.h" #include "arrow/testing/random.h" @@ -119,20 +118,19 @@ class DiffTest : public ::testing::Test { void BaseAndTargetFromRandomFilter(std::shared_ptr values, double filter_probability) { - compute::Datum out_datum, base_filter, target_filter; + std::shared_ptr base_filter, target_filter; do { base_filter = this->rng_.Boolean(values->length(), filter_probability, 0.0); target_filter = this->rng_.Boolean(values->length(), filter_probability, 0.0); - } while (base_filter.Equals(target_filter)); + } while (base_filter->Equals(target_filter)); - ASSERT_OK(compute::Filter(&ctx_, values, base_filter, {}, &out_datum)); + ASSERT_OK_AND_ASSIGN(Datum out_datum, compute::Filter(values, base_filter)); base_ = out_datum.make_array(); - ASSERT_OK(compute::Filter(&ctx_, values, target_filter, {}, &out_datum)); + ASSERT_OK_AND_ASSIGN(out_datum, compute::Filter(values, target_filter)); target_ = out_datum.make_array(); } - 
compute::FunctionContext ctx_; random::RandomArrayGenerator rng_; std::shared_ptr edits_; std::shared_ptr base_, target_; @@ -616,7 +614,6 @@ void MakeSameLength(std::shared_ptr* a, std::shared_ptr* b) { } TEST_F(DiffTest, CompareRandomStruct) { - compute::FunctionContext ctx; for (auto null_probability : {0.0, 0.25}) { constexpr auto length = 1 << 10; auto int32_values = this->rng_.Int32(length, 0, 127, null_probability); diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index 495a4a3f944..8ee87047a3d 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -58,7 +58,12 @@ function(ADD_ARROW_COMPUTE_TEST REL_TEST_NAME) ${ARG_UNPARSED_ARGUMENTS}) endfunction() -add_arrow_compute_test(compute_test) -add_arrow_benchmark(compute_benchmark) +add_arrow_compute_test(internals_test + SOURCES + function_test.cc + exec_test.cc + kernel_test.cc + registry_test.cc) +add_arrow_compute_test(exec_test) add_subdirectory(kernels) diff --git a/cpp/src/arrow/compute/README.md b/cpp/src/arrow/compute/README.md new file mode 100644 index 00000000000..80d8918e3d9 --- /dev/null +++ b/cpp/src/arrow/compute/README.md @@ -0,0 +1,58 @@ + + +## Apache Arrow C++ Compute Functions + +This submodule contains analytical functions that process primarily Arrow +columnar data; some functions can process scalar or Arrow-based array +inputs. These are intended for use inside query engines, data frame libraries, +etc. + +Many functions have SQL-like semantics in that they perform elementwise or +scalar operations on whole arrays at a time. Other functions are not SQL-like +and compute results that may be a different length or whose results depend on +the order of the values. + +Some basic terminology: + +* We use the term "function" to refer to particular general operation that may + have many different implementations corresponding to different combinations + of types or function behavior options. 
+* We call a specific implementation of a function a "kernel". When executing a + function on inputs, we must first select a suitable kernel (kernel selection + is called "dispatching") corresponding to the value types of the inputs. +* Functions along with their kernel implementations are collected in a + "function registry". Given a function name and argument types, we can look up + that function and dispatch to a compatible kernel. + +Types of functions + +* Scalar functions: elementwise functions that perform scalar operations in a + vectorized manner. These functions are generally valid in a SQL-like + context. These are called "scalar" in that the functions executed consider + each value in an array independently, and the output array or arrays have the + same length as the input arrays. The result for each array cell is generally + independent of its position in the array. +* Vector functions, which produce a result whose output is generally dependent + on the entire contents of the input arrays. These functions **are generally + not valid** for SQL-like processing because the output size may be different + than the input size, and the result may change based on the order of the + values in the array. This includes things like array subselection, sorting, + hashing, and more. +* Scalar aggregate functions, which can be used in a SQL-like context \ No newline at end of file diff --git a/cpp/src/arrow/compute/api.h b/cpp/src/arrow/compute/api.h index 8e60a39a0fd..8c3a2ac08ba 100644 --- a/cpp/src/arrow/compute/api.h +++ b/cpp/src/arrow/compute/api.h @@ -15,20 +15,13 @@ // specific language governing permissions and limitations // under the License.
-#pragma once +// NOTE: API is EXPERIMENTAL and will change without going through a +// deprecation cycle -#include "arrow/compute/context.h" // IWYU pragma: export -#include "arrow/compute/kernel.h" // IWYU pragma: export +#pragma once -#include "arrow/compute/kernels/boolean.h" // IWYU pragma: export -#include "arrow/compute/kernels/cast.h" // IWYU pragma: export -#include "arrow/compute/kernels/compare.h" // IWYU pragma: export -#include "arrow/compute/kernels/count.h" // IWYU pragma: export -#include "arrow/compute/kernels/filter.h" // IWYU pragma: export -#include "arrow/compute/kernels/hash.h" // IWYU pragma: export -#include "arrow/compute/kernels/isin.h" // IWYU pragma: export -#include "arrow/compute/kernels/mean.h" // IWYU pragma: export -#include "arrow/compute/kernels/nth_to_indices.h" // IWYU pragma: export -#include "arrow/compute/kernels/sort_to_indices.h" // IWYU pragma: export -#include "arrow/compute/kernels/sum.h" // IWYU pragma: export -#include "arrow/compute/kernels/take.h" // IWYU pragma: export +#include "arrow/compute/api_eager.h" // IWYU pragma: export +#include "arrow/compute/exec.h" // IWYU pragma: export +#include "arrow/compute/function.h" // IWYU pragma: export +#include "arrow/compute/kernel.h" // IWYU pragma: export +#include "arrow/compute/registry.h" // IWYU pragma: export diff --git a/cpp/src/arrow/compute/api_eager.cc b/cpp/src/arrow/compute/api_eager.cc new file mode 100644 index 00000000000..129a40f69f8 --- /dev/null +++ b/cpp/src/arrow/compute/api_eager.cc @@ -0,0 +1,209 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/api_eager.h" + +#include +#include +#include + +#include "arrow/compute/exec.h" + +namespace arrow { +namespace compute { + +#define SCALAR_EAGER_UNARY(NAME, REGISTRY_NAME) \ + Result NAME(const Datum& value, ExecContext* ctx) { \ + return ExecScalarFunction(ctx, REGISTRY_NAME, {value}); \ + } + +#define SCALAR_EAGER_BINARY(NAME, REGISTRY_NAME) \ + Result NAME(const Datum& left, const Datum& right, ExecContext* ctx) { \ + return ExecScalarFunction(ctx, REGISTRY_NAME, {left, right}); \ + } + +// ---------------------------------------------------------------------- +// Arithmetic + +SCALAR_EAGER_BINARY(Add, "add") + +// ---------------------------------------------------------------------- +// Set-related operations + +static Result ExecSetLookup(const std::string& func_name, const Datum& data, + std::shared_ptr value_set, + bool add_nulls_to_hash_table, ExecContext* ctx) { + if (value_set->length() > 0 && !data.type()->Equals(value_set->type())) { + std::stringstream ss; + ss << "Array type didn't match type of values set: " << data.type()->ToString() + << " vs " << value_set->type()->ToString(); + return Status::Invalid(ss.str()); + } + SetLookupOptions options(std::move(value_set), !add_nulls_to_hash_table); + return ExecScalarFunction(ctx, func_name, {data}, &options); +} + +Result IsIn(const Datum& values, std::shared_ptr value_set, + ExecContext* ctx) { + return ExecSetLookup("isin", values, std::move(value_set), + /*add_nulls_to_hash_table=*/false, ctx); +} + +Result Match(const Datum& values, std::shared_ptr 
value_set, + ExecContext* ctx) { + return ExecSetLookup("match", values, std::move(value_set), + /*add_nulls_to_hash_table=*/true, ctx); +} + +// ---------------------------------------------------------------------- +// Boolean functions + +SCALAR_EAGER_UNARY(Invert, "invert") +SCALAR_EAGER_BINARY(And, "and") +SCALAR_EAGER_BINARY(KleeneAnd, "and_kleene") +SCALAR_EAGER_BINARY(Or, "or") +SCALAR_EAGER_BINARY(KleeneOr, "or_kleene") +SCALAR_EAGER_BINARY(Xor, "xor") + +// ---------------------------------------------------------------------- + +Result Compare(const Datum& left, const Datum& right, CompareOptions options, + ExecContext* ctx) { + std::string func_name; + switch (options.op) { + case CompareOperator::EQUAL: + func_name = "=="; + break; + case CompareOperator::NOT_EQUAL: + func_name = "!="; + break; + case CompareOperator::GREATER: + func_name = ">"; + break; + case CompareOperator::GREATER_EQUAL: + func_name = ">="; + break; + case CompareOperator::LESS: + func_name = "<"; + break; + case CompareOperator::LESS_EQUAL: + func_name = "<="; + break; + default: + DCHECK(false); + break; + } + return ExecScalarFunction(ctx, func_name, {left, right}, &options); +} + +// ---------------------------------------------------------------------- +// Scalar aggregates + +Result Count(const Datum& value, CountOptions options, ExecContext* ctx) { + return ExecScalarAggregateFunction(ctx, "count", {value}, &options); +} + +Result Mean(const Datum& value, ExecContext* ctx) { + return ExecScalarAggregateFunction(ctx, "mean", {value}); +} + +Result Sum(const Datum& value, ExecContext* ctx) { + return ExecScalarAggregateFunction(ctx, "sum", {value}); +} + +// Result MinMax(const Datum& value, const MinMaxOptions& options, +// ExecContext* ctx) { +// return ExecScalarAggregateFunction(ctx, "minmax", {value}); +// } + +// ---------------------------------------------------------------------- +// Vector functions + +namespace { + +// Status InvokeHash(FunctionContext* ctx, 
HashKernel* func, const Datum& value, +// std::vector* kernel_outputs, +// std::shared_ptr* dictionary) { +// RETURN_NOT_OK(detail::InvokeUnaryArrayKernel(ctx, func, value, kernel_outputs)); +// std::shared_ptr dict_data; +// RETURN_NOT_OK(func->GetDictionary(&dict_data)); +// *dictionary = MakeArray(dict_data); +// return Status::OK(); +// } + +} // namespace + +Result> Unique(const Datum& value, ExecContext* ctx) { + // std::unique_ptr func; + // RETURN_NOT_OK(GetUniqueKernel(ctx, value.type(), &func)); + // std::vector dummy_outputs; + // return InvokeHash(ctx, func.get(), value, &dummy_outputs, out); + return Status::NotImplemented("NYI"); +} + +Result DictionaryEncode(const Datum& value, ExecContext* ctx) { + // std::unique_ptr func; + // RETURN_NOT_OK(GetDictionaryEncodeKernel(ctx, value.type(), &func)); + // std::shared_ptr dict; + // std::vector indices_outputs; + // RETURN_NOT_OK(InvokeHash(ctx, func.get(), value, &indices_outputs, &dict)); + // auto dict_type = dictionary(func->out_type(), dict->type()); + // // Wrap indices in dictionary arrays for result + // std::vector> dict_chunks; + // for (const Datum& datum : indices_outputs) { + // dict_chunks.emplace_back( + // std::make_shared(dict_type, datum.make_array(), dict)); + // } + // *out = detail::WrapArraysLike(value, dict_type, dict_chunks); + // return Status::OK(); + return Status::NotImplemented("NYI"); +} + +const char kValuesFieldName[] = "values"; +const char kCountsFieldName[] = "counts"; +const int32_t kValuesFieldIndex = 0; +const int32_t kCountsFieldIndex = 1; + +Result> ValueCounts(const Datum& value, ExecContext* ctx) { + // std::unique_ptr func; + // RETURN_NOT_OK(GetValueCountsKernel(ctx, value.type(), &func)); + // // Calls return nothing for counts. 
+ // std::vector unused_output; + // std::shared_ptr uniques; + // RETURN_NOT_OK(InvokeHash(ctx, func.get(), value, &unused_output, &uniques)); + // Datum value_counts; + // RETURN_NOT_OK(func->FlushFinal(&value_counts)); + // auto data_type = std::make_shared(std::vector>{ + // std::make_shared(kValuesFieldName, uniques->type()), + // std::make_shared(kCountsFieldName, int64())}); + // *counts = std::make_shared( + // data_type, uniques->length(), + // std::vector>{uniques, MakeArray(value_counts.array())}); + // return Status::OK(); + return Status::NotImplemented("NYI"); +} + +Result> PartitionIndices(const Array& values, int64_t n, + ExecContext* ctx) { + PartitionOptions options(/*pivot=*/n); + ARROW_ASSIGN_OR_RAISE(Datum result, ExecVectorFunction(ctx, "partition_indices", + {Datum(values)}, &options)); + return result.make_array(); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/api_eager.h b/cpp/src/arrow/compute/api_eager.h new file mode 100644 index 00000000000..d41210c9594 --- /dev/null +++ b/cpp/src/arrow/compute/api_eager.h @@ -0,0 +1,341 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Eager evaluation convenience APIs for invoking common functions, including +// necessary memory allocations + +#pragma once + +#include + +#include "arrow/compute/cast.h" +#include "arrow/compute/exec.h" +#include "arrow/compute/filter.h" +#include "arrow/compute/options.h" +#include "arrow/compute/take.h" +#include "arrow/datum.h" +#include "arrow/result.h" + +namespace arrow { +namespace compute { + +class ExecContext; + +// ---------------------------------------------------------------------- + +/// \brief Add two values together. Array values must be the same length. If a +/// value is null in either addend, the result is null +/// +/// \param[in] left the first value +/// \param[in] right the second value +/// \param[in] ctx the function execution context, optional +/// \return the elementwise addition of the values +ARROW_EXPORT +Result Add(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); + +/// \brief Compare a numeric array with a scalar. +/// +/// \param[in] left datum to compare, must be an Array +/// \param[in] right datum to compare, must be a Scalar of the same type as the +/// left Datum. +/// \param[in] options compare options +/// \param[in] ctx the function execution context, optional +/// \return resulting datum +/// +/// Note: on floating point arrays, this uses IEEE 754 compare semantics. +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Compare(const Datum& left, const Datum& right, + struct CompareOptions options, ExecContext* ctx = NULLPTR); + +/// \brief Invert the values of a boolean datum +/// \param[in] value datum to invert +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Invert(const Datum& value, ExecContext* ctx = NULLPTR); + +/// \brief Element-wise AND of two boolean datums which always propagates nulls +/// (null and false is null).
+/// +/// \param[in] left left operand (array) +/// \param[in] right right operand (array) +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result And(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); + +/// \brief Element-wise AND of two boolean datums with a Kleene truth table +/// (null and false is false). +/// +/// \param[in] left left operand (array) +/// \param[in] right right operand (array) +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result KleeneAnd(const Datum& left, const Datum& right, + ExecContext* ctx = NULLPTR); + +/// \brief Element-wise OR of two boolean datums which always propagates nulls +/// (null and true is null). +/// +/// \param[in] left left operand (array) +/// \param[in] right right operand (array) +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Or(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); + +/// \brief Element-wise OR of two boolean datums with a Kleene truth table +/// (null or true is true). 
+/// +/// \param[in] left left operand (array) +/// \param[in] right right operand (array) +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result KleeneOr(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); + +/// \brief Element-wise XOR of two boolean datums +/// \param[in] left left operand (array) +/// \param[in] right right operand (array) +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Xor(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR); + +/// \brief IsIn returns true for each element of `values` that is contained in +/// `value_set` +/// +/// If null occurs in left, if null count in right is not 0, +/// it returns true, else returns null. +/// +/// \param[in] values array-like input to look up in value_set +/// \param[in] value_set Array input +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result IsIn(const Datum& values, std::shared_ptr value_set, + ExecContext* ctx = NULLPTR); + +/// \brief Match examines each slot in the haystack against a needles array. +/// If the value is not found in needles, null will be output. +/// If found, the index of occurrence within needles (ignoring duplicates) +/// will be output. +/// +/// For example given haystack = [99, 42, 3, null] and +/// needles = [3, 3, 99], the output will be = [1, null, 0, null] +/// +/// Note: Null in the haystack is considered to match +/// a null in the needles array. 
For example given +/// haystack = [99, 42, 3, null] and needles = [3, 99, null], +/// the output will be = [1, null, 0, 2] +/// +/// \param[in] haystack array-like input +/// \param[in] needles Array input +/// \param[in] ctx the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Match(const Datum& haystack, std::shared_ptr needles, + ExecContext* ctx = NULLPTR); + +/// \brief Returns indices that partition an array around n-th +/// sorted element. +/// +/// Find index of n-th(0 based) smallest value and perform indirect +/// partition of an array around that element. Output indices[0 ~ n-1] +/// holds values no greater than n-th element, and indices[n+1 ~ end] +/// holds values no less than n-th element. Elements in each partition +/// is not sorted. Nulls will be partitioned to the end of the output. +/// Output is not guaranteed to be stable. +/// +/// \param[in] values array to be partitioned +/// \param[in] n pivot array around sorted n-th element +/// \param[in] ctx the function execution context, optional +/// \return offsets indices that would partition an array +ARROW_EXPORT +Result> PartitionIndices(const Array& values, int64_t n, + ExecContext* ctx = NULLPTR); + +ARROW_DEPRECATED("Deprecated in 1.0.0. Use PartitionIndices") +Result> NthToIndices(const Array& values, int64_t n, + ExecContext* ctx = NULLPTR) { + return PartitionIndices(values, n, ctx); +} + +/// \brief Returns the indices that would sort an array. +/// +/// Perform an indirect sort of array. The output array will contain +/// indices that would sort an array, which would be the same length +/// as input. Nulls will be stably partitioned to the end of the output. 
+/// +/// For example given values = [null, 1, 3.3, null, 2, 5.3], the output +/// will be [1, 4, 2, 5, 0, 3] +/// +/// \param[in] values array to sort +/// \param[in] ctx the function execution context, optional +/// \return offsets indices that would sort an array +ARROW_EXPORT +Result> SortToIndices(const Array& values, + ExecContext* ctx = NULLPTR); + +/// \brief Compute unique elements from an array-like object +/// +/// Note if a null occurs in the input it will NOT be included in the output. +/// +/// \param[in] datum array-like input +/// \param[in] ctx the function execution context, optional +/// \return result as Array +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result> Unique(const Datum& datum, ExecContext* ctx = NULLPTR); + +// Constants for accessing the output of ValueCounts +ARROW_EXPORT extern const char kValuesFieldName[]; +ARROW_EXPORT extern const char kCountsFieldName[]; +ARROW_EXPORT extern const int32_t kValuesFieldIndex; +ARROW_EXPORT extern const int32_t kCountsFieldIndex; +/// \brief Return counts of unique elements from an array-like object. +/// +/// Note that the counts do not include counts for nulls in the array. These can be +/// obtained separately from metadata. +/// +/// For floating point arrays there is no attempt to normalize -0.0, 0.0 and NaN values +/// which can lead to unexpected results if the input Array has these values. +/// +/// \param[in] value array-like input +/// \param[in] ctx the function execution context, optional +/// \return counts An array of structs. 
+/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result> ValueCounts(const Datum& value, + ExecContext* ctx = NULLPTR); + +/// \brief Dictionary-encode values in an array-like object +/// \param[in] data array-like input +/// \param[in] ctx the function execution context, optional +/// \return result with same shape and type as input +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result DictionaryEncode(const Datum& data, ExecContext* ctx = NULLPTR); + +// ---------------------------------------------------------------------- +// Aggregate functions + +/// \brief Count non-null (or null) values in an array. +/// +/// \param[in] options counting options, see CountOptions for more information +/// \param[in] datum to count +/// \param[in] ctx the function execution context, optional +/// \return out resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Count(const Datum& datum, CountOptions options = CountOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the mean of a numeric array. +/// +/// \param[in] value datum to compute the mean, expecting Array +/// \param[in] ctx the function execution context, optional +/// \return datum of the computed mean as a DoubleScalar +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Mean(const Datum& value, ExecContext* ctx = NULLPTR); + +/// \brief Sum values of a numeric array. +/// +/// \param[in] value datum to sum, expecting Array or ChunkedArray +/// \param[in] ctx the function execution context, optional +/// \return datum of the computed sum as a Scalar +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Sum(const Datum& value, ExecContext* ctx = NULLPTR); + +/// \brief Calculate the min / max of a numeric array +/// +/// This function returns both the min and max as a collection. 
The resulting +/// datum thus consists of two scalar datums: {Datum(min), Datum(max)} +/// +/// \param[in] value input datum, expecting Array or ChunkedArray +/// \param[in] options see MinMaxOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return resulting datum containing a {min, max} collection +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result MinMax(const Datum& value, + const MinMaxOptions& options = MinMaxOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Calculate the min / max of a numeric array. +/// +/// This function returns both the min and max as a collection. The resulting +/// datum thus consists of two scalar datums: {Datum(min), Datum(max)} +/// +/// \param[in] array input array +/// \param[in] options see MinMaxOptions for more information +/// \param[in] ctx the function execution context, optional +/// \return resulting datum containing a {min, max} collection +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result MinMax(const Array& array, + const MinMaxOptions& options = MinMaxOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/cast.h b/cpp/src/arrow/compute/cast.h new file mode 100644 index 00000000000..1c8c1f511d9 --- /dev/null +++ b/cpp/src/arrow/compute/cast.h @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/compute/exec.h" +#include "arrow/compute/options.h" +#include "arrow/datum.h" +#include "arrow/result.h" + +namespace arrow { +namespace compute { + +class ExecContext; + +// ---------------------------------------------------------------------- +// Convenience invocation APIs for a number of kernels + +/// \brief Cast from one array type to another +/// \param[in] value array to cast +/// \param[in] to_type type to cast to +/// \param[in] options casting options +/// \param[in] context the function execution context, optional +/// \return the resulting array +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result> Cast(const Array& value, std::shared_ptr to_type, + const CastOptions& options = CastOptions::Safe(), + ExecContext* context = NULLPTR); + +/// \brief Cast from one value to another +/// \param[in] value datum to cast +/// \param[in] to_type type to cast to +/// \param[in] options casting options +/// \param[in] context the function execution context, optional +/// \return the resulting datum +/// +/// \since 1.0.0 +/// \note API not yet finalized +ARROW_EXPORT +Result Cast(const Datum& value, std::shared_ptr to_type, + const CastOptions& options = CastOptions::Safe(), + ExecContext* context = NULLPTR); + +/// \brief Return true if a cast function is defined +ARROW_EXPORT +bool CanCast(const DataType& from_type, const DataType& to_type); + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/compute_test.cc 
b/cpp/src/arrow/compute/compute_test.cc deleted file mode 100644 index cd33466a67a..00000000000 --- a/cpp/src/arrow/compute/compute_test.cc +++ /dev/null @@ -1,95 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include -#include -#include -#include - -#include - -#include "arrow/array.h" -#include "arrow/buffer.h" -#include "arrow/memory_pool.h" -#include "arrow/status.h" -#include "arrow/table.h" -#include "arrow/testing/gtest_common.h" -#include "arrow/testing/gtest_util.h" -#include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/decimal.h" - -#include "arrow/compute/context.h" -#include "arrow/compute/kernel.h" -#include "arrow/compute/kernels/util_internal.h" -#include "arrow/compute/test_util.h" - -namespace arrow { -namespace compute { - -// ---------------------------------------------------------------------- -// Datum - -template -void CheckImplicitConstructor(enum Datum::type expected_kind) { - std::shared_ptr value; - Datum datum = value; - ASSERT_EQ(expected_kind, datum.kind()); -} - -TEST(TestDatum, ImplicitConstructors) { - CheckImplicitConstructor(Datum::SCALAR); - - CheckImplicitConstructor(Datum::ARRAY); - - // Instantiate from array subclass - 
CheckImplicitConstructor(Datum::ARRAY); - - CheckImplicitConstructor(Datum::CHUNKED_ARRAY); - CheckImplicitConstructor(Datum::RECORD_BATCH); - - CheckImplicitConstructor(Datum::TABLE); -} - -class TestInvokeBinaryKernel : public ComputeFixture, public TestBase {}; - -TEST_F(TestInvokeBinaryKernel, Exceptions) { - MockBinaryKernel kernel; - std::vector outputs; - std::shared_ptr
table; - std::vector values1 = {true, false, true}; - std::vector values2 = {false, true, false}; - - auto type = boolean(); - auto a1 = _MakeArray(type, values1, {}); - auto a2 = _MakeArray(type, values2, {}); - - // Left is not an array-like - ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, table, a2, - &outputs)); - // Right is not an array-like - ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, a1, table, - &outputs)); - // Different sized inputs - ASSERT_RAISES(Invalid, detail::InvokeBinaryArrayKernel(&this->ctx_, &kernel, a1, - a1->Slice(1), &outputs)); -} - -} // namespace compute -} // namespace arrow diff --git a/cpp/src/arrow/compute/context.h b/cpp/src/arrow/compute/context.h deleted file mode 100644 index dde8b686fc3..00000000000 --- a/cpp/src/arrow/compute/context.h +++ /dev/null @@ -1,79 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#pragma once - -#include -#include - -#include "arrow/memory_pool.h" -#include "arrow/status.h" -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Buffer; - -namespace internal { -class CpuInfo; -} // namespace internal - -namespace compute { - -#define ARROW_RETURN_IF_ERROR(ctx) \ - if (ARROW_PREDICT_FALSE(ctx->HasError())) { \ - Status s = ctx->status(); \ - ctx->ResetStatus(); \ - return s; \ - } - -/// \brief Container for variables and options used by function evaluation -class ARROW_EXPORT FunctionContext { - public: - explicit FunctionContext(MemoryPool* pool = default_memory_pool()); - MemoryPool* memory_pool() const; - - /// \brief Allocate buffer from the context's memory pool - Status Allocate(const int64_t nbytes, std::shared_ptr* out); - - /// \brief Indicate that an error has occurred, to be checked by a parent caller - /// \param[in] status a Status instance - /// - /// \note Will not overwrite a prior set Status, so we will have the first - /// error that occurred until FunctionContext::ResetStatus is called - void SetStatus(const Status& status); - - /// \brief Clear any error status - void ResetStatus(); - - /// \brief Return true if an error has occurred - bool HasError() const { return !status_.ok(); } - - /// \brief Return the current status of the context - const Status& status() const { return status_; } - - internal::CpuInfo* cpu_info() const { return cpu_info_; } - - private: - Status status_; - MemoryPool* pool_; - internal::CpuInfo* cpu_info_; -}; - -} // namespace compute -} // namespace arrow diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc new file mode 100644 index 00000000000..7683ef23403 --- /dev/null +++ b/cpp/src/arrow/compute/exec.cc @@ -0,0 +1,859 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/exec.h" + +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/compute/exec_internal.h" +#include "arrow/compute/function.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/registry.h" +#include "arrow/datum.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/cpu_info.h" +#include "arrow/util/logging.h" + +namespace arrow { + +using internal::checked_cast; + +namespace compute { + +#define CTX_RETURN_IF_ERROR(CTX) \ + do { \ + if (ARROW_PREDICT_FALSE((CTX)->HasError())) { \ + Status s = (CTX)->status(); \ + (CTX)->ResetStatus(); \ + return s; \ + } \ + } while (0) + +namespace { + +Result> AllocateDataBuffer(KernelContext* ctx, int64_t length, + int bit_width) { + if (bit_width == 1) { + return ctx->AllocateBitmap(length); + } else { + ARROW_CHECK_EQ(bit_width % 8, 0) + << "Only bit widths with multiple of 8 are currently supported"; + int64_t buffer_size = length * bit_width / 8; + return ctx->Allocate(buffer_size); + } + return Status::OK(); +} + +bool CanPreallocate(const DataType& type) { return is_fixed_width(type.id()); } + +Status GetValueDescriptors(const std::vector& args, + std::vector* 
descrs) { + for (const auto& arg : args) { + descrs->emplace_back(arg.descr()); + } + return Status::OK(); +} + +} // namespace + +namespace detail { + +ExecBatchIterator::ExecBatchIterator(std::vector args, int64_t length, + int64_t max_chunksize) + : args_(std::move(args)), + position_(0), + length_(length), + max_chunksize_(max_chunksize), + finished_(false) { + chunk_indexes_.resize(args_.size(), 0); + chunk_positions_.resize(args_.size(), 0); +} + +Result> ExecBatchIterator::Make( + std::vector args, int64_t max_chunksize) { + for (const auto& arg : args) { + if (!(arg.is_arraylike() || arg.is_scalar())) { + return Status::Invalid( + "ExecBatchIterator only works with Scalar, Array, and " + "ChunkedArray arguments"); + } + } + + // If the arguments are all scalars, then the length is 1 + int64_t length = 1; + + bool length_set = false; + for (size_t i = 0; i < args.size(); ++i) { + if (args[i].is_scalar()) { + continue; + } + if (!length_set) { + length = args[i].length(); + length_set = true; + } else { + if (args[i].length() != length) { + return Status::Invalid("Array arguments must all be the same length"); + } + } + } + + // No maximum was indicated + if (max_chunksize < 1) { + max_chunksize = length; + } + + return std::unique_ptr( + new ExecBatchIterator(std::move(args), length, max_chunksize)); +} + +bool ExecBatchIterator::Next(ExecBatch* batch) { + if (finished_) return false; + + // Determine how large the common contiguous "slice" of all the arguments is + int64_t iteration_size = std::min(length_ - position_, max_chunksize_); + + // If length_ is 0, then this loop will never execute + for (size_t i = 0; i < args_.size() && iteration_size > 0; ++i) { + // If the argument is not a chunked array, it's either a Scalar or Array, + // in which case it doesn't influence the size of this batch. 
Note that if + // the args are all scalars the batch length is 1 + if (args_[i].kind() != Datum::CHUNKED_ARRAY) { + continue; + } + const ChunkedArray& arg = *args_[i].chunked_array(); + std::shared_ptr current_chunk; + while (true) { + current_chunk = arg.chunk(chunk_indexes_[i]); + if (chunk_positions_[i] == current_chunk->length()) { + // Chunk is zero-length, or was exhausted in the previous iteration + chunk_positions_[i] = 0; + ++chunk_indexes_[i]; + continue; + } + break; + } + iteration_size = + std::min(current_chunk->length() - chunk_positions_[i], iteration_size); + } + + // Now, fill the batch + batch->values.resize(args_.size()); + batch->length = iteration_size; + for (size_t i = 0; i < args_.size(); ++i) { + if (args_[i].is_scalar()) { + batch->values[i] = args_[i].scalar(); + } else if (args_[i].is_array()) { + batch->values[i] = args_[i].array()->Slice(position_, iteration_size); + } else { + const ChunkedArray& carr = *args_[i].chunked_array(); + if (carr.num_chunks() > 0) { + const auto& chunk = carr.chunk(chunk_indexes_[i]); + batch->values[i] = chunk->data()->Slice(chunk_positions_[i], iteration_size); + } else { + // Degenerate case of a ChunkedArray with zero chunks + DCHECK_EQ(0, length_); + batch->values[i] = ArrayData::Make(carr.type(), 0); + } + chunk_positions_[i] += iteration_size; + } + } + position_ += iteration_size; + DCHECK_LE(position_, length_); + if (position_ == length_) { + finished_ = true; + } + return true; +} + +// Null propagation implementation that deals both with preallocated bitmaps +// and maybe-to-be allocated bitmaps +// +// If the bitmap is preallocated, it MUST be populated (since it might be a +// view of a much larger bitmap). If it isn't preallocated, then we have +// more flexibility. 
+// +// * If the batch has no nulls, then we do nothing +// * If only a single array has nulls, and its offset is a multiple of 8, +// then we can zero-copy the bitmap into the output +// * Otherwise, we allocate the bitmap and populate it +class NullPropagator { + public: + NullPropagator(KernelContext* ctx, const ExecBatch& batch, ArrayData* output) + : ctx_(ctx), batch_(batch), output_(output) { + // At this point, the values in batch_.values must have been validated to + // all be value-like + for (const Datum& val : batch_.values) { + if (val.kind() == Datum::ARRAY) { + // Do not count the bits if they haven't been counted already + const int64_t known_null_count = val.array()->null_count.load(); + if (known_null_count == kUnknownNullCount || known_null_count > 0) { + values_with_nulls_.push_back(&val); + } + } else if (!val.scalar()->is_valid) { + values_with_nulls_.push_back(&val); + } + } + + if (output->buffers[0] != nullptr) { + bitmap_preallocated_ = true; + SetBitmap(output_->buffers[0].get()); + } + } + + void SetBitmap(Buffer* bitmap) { bitmap_ = bitmap->mutable_data(); } + + Status EnsureAllocated() { + if (bitmap_preallocated_) { + return Status::OK(); + } + ARROW_ASSIGN_OR_RAISE(output_->buffers[0], ctx_->AllocateBitmap(output_->length)); + SetBitmap(output_->buffers[0].get()); + return Status::OK(); + } + + Result ShortCircuitIfAllNull() { + // An all-null value (scalar null or all-null array) gives us a short + // circuit opportunity + bool is_all_null = false; + std::shared_ptr all_null_bitmap; + + // Walk all the values with nulls instead of breaking on the first in case + // we find a bitmap that can be reused in the non-preallocated case + for (const Datum* value : values_with_nulls_) { + if (value->type()->id() == Type::NA) { + // No bitmap + is_all_null = true; + } else if (value->kind() == Datum::ARRAY) { + const ArrayData& arr = *value->array(); + if (arr.null_count.load() == arr.length) { + // Pluck the all null bitmap so we can set it 
in the output if it was + // not pre-allocated + all_null_bitmap = arr.buffers[0]; + is_all_null = true; + } + } else { + // Scalar + is_all_null = true; + } + } + if (!is_all_null) { + return false; + } + + // OK, the output should be all null + output_->null_count = output_->length; + + if (!bitmap_preallocated_ && all_null_bitmap) { + // If we did not pre-allocate memory, and we observed an all-null bitmap, + // then we can zero-copy it into the output + output_->buffers[0] = std::move(all_null_bitmap); + } else { + RETURN_NOT_OK(EnsureAllocated()); + BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, false); + } + return true; + } + + Status PropagateSingle() { + // One array + const ArrayData& arr = *values_with_nulls_[0]->array(); + const std::shared_ptr& arr_bitmap = arr.buffers[0]; + + // Reuse the null count if it's known + output_->null_count = arr.null_count.load(); + + if (bitmap_preallocated_) { + internal::CopyBitmap(arr_bitmap->data(), arr.offset, arr.length, bitmap_, + output_->offset); + } else { + // Two cases when memory was not pre-allocated: + // + // * Offset is zero: we reuse the bitmap as is + // * Offset is nonzero but a multiple of 8: we can slice the bitmap + // * Offset is not a multiple of 8: we must allocate and use CopyBitmap + // + // Keep in mind that output_->offset is not permitted to be nonzero when + // the bitmap is not preallocated, and that precondition is asserted + // higher in the call stack. + if (arr.offset == 0) { + output_->buffers[0] = arr_bitmap; + } else if (arr.offset % 8 == 0) { + output_->buffers[0] = + SliceBuffer(arr_bitmap, arr.offset / 8, BitUtil::BytesForBits(arr.length)); + } else { + RETURN_NOT_OK(EnsureAllocated()); + internal::CopyBitmap(arr_bitmap->data(), arr.offset, arr.length, bitmap_, + /*dst_offset=*/0); + } + } + return Status::OK(); + } + + Status PropagateMultiple() { + // More than one array. 
We use BitmapAnd to intersect their bitmaps + + // Do not compute the intersection null count until it's needed + RETURN_NOT_OK(EnsureAllocated()); + + auto Accumulate = [&](const ArrayData& left, const ArrayData& right) { + internal::BitmapAnd(left.buffers[0]->data(), left.offset, right.buffers[0]->data(), + right.offset, output_->length, output_->offset, + output_->buffers[0]->mutable_data()); + }; + + DCHECK_GT(values_with_nulls_.size(), 1); + + // Seed the output bitmap with the & of the first two bitmaps + Accumulate(*values_with_nulls_[0]->array(), *values_with_nulls_[1]->array()); + + // Accumulate the rest + for (size_t i = 2; i < values_with_nulls_.size(); ++i) { + Accumulate(*output_, *values_with_nulls_[i]->array()); + } + return Status::OK(); + } + + Status Execute() { + bool finished = false; + ARROW_ASSIGN_OR_RAISE(finished, ShortCircuitIfAllNull()); + if (finished) { + return Status::OK(); + } + + // At this point, by construction we know that all of the values in + // values_with_nulls_ are arrays that are not all null. So there are a + // few cases: + // + // * No arrays. 
This is a no-op w/o preallocation but when the bitmap is + // pre-allocated we have to fill it with 1's + // * One array, whose bitmap can be zero-copied (w/o preallocation, and + // when no byte is split) or copied (split byte or w/ preallocation) + // * More than one array, we must compute the intersection of all the + // bitmaps + // + // BUT, if the output offset is nonzero for some reason, we copy into the + // output unconditionally + + output_->null_count = kUnknownNullCount; + + if (values_with_nulls_.size() == 0) { + // No arrays with nulls case + output_->null_count = 0; + if (bitmap_preallocated_) { + BitUtil::SetBitsTo(bitmap_, output_->offset, output_->length, true); + } + return Status::OK(); + } else if (values_with_nulls_.size() == 1) { + return PropagateSingle(); + } else { + return PropagateMultiple(); + } + } + + private: + KernelContext* ctx_; + const ExecBatch& batch_; + std::vector values_with_nulls_; + ArrayData* output_; + uint8_t* bitmap_; + bool bitmap_preallocated_ = false; +}; + +Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* output) { + DCHECK_NE(nullptr, output); + DCHECK_GT(output->buffers.size(), 0); + + if (output->type->id() == Type::NA) { + // Null output type is a no-op (rare when this would happen but we at least + // will test for it) + return Status::OK(); + } + + // This function is ONLY able to write into output with non-zero offset + // when the bitmap is preallocated. 
This could be a DCHECK but returning + // error Status for now for emphasis + if (output->offset != 0 && output->buffers[0] == nullptr) { + return Status::Invalid( + "Can only propagate nulls into pre-allocated memory " + "when the output offset is non-zero"); + } + NullPropagator propagator(ctx, batch, output); + return propagator.Execute(); +} + +Status ExecListener::OnResult(Datum) { return Status::NotImplemented("OnResult"); } + +class DatumAccumulator : public ExecListener { + public: + DatumAccumulator() {} + + Status OnResult(Datum value) override { + values_.emplace_back(value); + return Status::OK(); + } + + std::vector values() const { return values_; } + + private: + std::vector values_; +}; + +template +class FunctionExecutorImpl : public FunctionExecutor { + public: + FunctionExecutorImpl(ExecContext* exec_ctx, const FunctionType* func, + const FunctionOptions* options) + : exec_ctx_(exec_ctx), kernel_ctx_(exec_ctx), func_(func), options_(options) {} + + protected: + using KernelType = typename FunctionType::KernelType; + + void Reset() {} + + Status InitState() { + // Some kernels require initialization of an opaque state object + if (kernel_->init) { + state_ = kernel_->init(&kernel_ctx_, *kernel_, options_); + CTX_RETURN_IF_ERROR(&kernel_ctx_); + kernel_ctx_.SetState(state_.get()); + } + return Status::OK(); + } + + Status BindArgs(const std::vector& args) { + std::vector arg_descrs; + RETURN_NOT_OK(GetValueDescriptors(args, &arg_descrs)); + ARROW_ASSIGN_OR_RAISE(kernel_, func_->DispatchExact(arg_descrs)); + + // Resolve the output descriptor for this kernel + ARROW_ASSIGN_OR_RAISE(output_descr_, + kernel_->signature->out_type().Resolve(arg_descrs)); + + ARROW_ASSIGN_OR_RAISE(batch_iterator_, + ExecBatchIterator::Make(args, exec_ctx_->exec_chunksize())); + + return Status::OK(); + } + + ValueDescr output_descr() const override { return output_descr_; } + + ExecContext* exec_ctx_; + KernelContext kernel_ctx_; + const FunctionType* func_; + const 
KernelType* kernel_; + std::unique_ptr batch_iterator_; + std::unique_ptr state_; + ValueDescr output_descr_; + const FunctionOptions* options_; +}; + +// Executor for SCALAR and VECTOR functions +template +class ArrayExecutor : public FunctionExecutorImpl { + public: + using BASE = FunctionExecutorImpl; + using BASE::BASE; + + Status ExecuteBatch(const ExecBatch& batch, ExecListener* listener) { + Datum out; + RETURN_NOT_OK(PrepareNextOutput(batch, &out)); + + if (kernel_->null_handling == NullHandling::INTERSECTION && + output_descr_.shape == ValueDescr::ARRAY) { + RETURN_NOT_OK(PropagateNulls(&kernel_ctx_, batch, out.mutable_array())); + } + + kernel_->exec(&kernel_ctx_, batch, &out); + CTX_RETURN_IF_ERROR(&kernel_ctx_); + if (!preallocate_contiguous_) { + // If we are producing chunked output rather than one big array, then + // emit each chunk as soon as it's available + RETURN_NOT_OK(listener->OnResult(std::move(out))); + } + return Status::OK(); + } + + Status PrepareExecute(const std::vector& args) { + this->Reset(); + RETURN_NOT_OK(this->BindArgs(args)); + RETURN_NOT_OK(this->InitState()); + output_num_buffers_ = static_cast(output_descr_.type->layout().buffers.size()); + + // If the executor is configured to produce a single large Array output for + // kernels supporting preallocation, then we do so up front and then + // iterate over slices of that large array. 
Otherwise, we preallocate prior + // to processing each batch emitted from the ExecBatchIterator + if (output_descr_.shape == ValueDescr::ARRAY) { + RETURN_NOT_OK(SetupPreallocation(batch_iterator_->length())); + } + return Status::OK(); + } + + Status Execute(const std::vector& args, ExecListener* listener) override { + RETURN_NOT_OK(PrepareExecute(args)); + ExecBatch batch; + while (batch_iterator_->Next(&batch)) { + RETURN_NOT_OK(ExecuteBatch(batch, listener)); + } + if (preallocate_contiguous_) { + // If we preallocated one big chunk, since the kernel execution is + // completed, we can now emit it + RETURN_NOT_OK(listener->OnResult(std::move(preallocated_))); + } + return Status::OK(); + } + + protected: + // We must accommodate two different modes of execution for preallocated + // execution + // + // * A single large ("contiguous") allocation that we populate with results + // on a chunkwise basis according to the ExecBatchIterator. This permits + // parallelization even if the objective is to obtain a single Array or + // ChunkedArray at the end + // * A standalone buffer preallocation for each chunk emitted from the + // ExecBatchIterator + // + // When data buffer preallocation is not possible (e.g. with BINARY / STRING + // outputs), then contiguous results are only possible if the input is + // contiguous. + + Status PrepareNextOutput(const ExecBatch& batch, Datum* out) { + if (output_descr_.shape == ValueDescr::ARRAY) { + if (preallocate_contiguous_) { + // The output is already fully preallocated + const int64_t batch_start_position = batch_iterator_->position() - batch.length; + + if (batch.length < batch_iterator_->length()) { + // If this is a partial execution, then we write into a slice of + // preallocated_ + // + // XXX: ArrayData::Slice not returning std::shared_ptr is + // a nuisance + out->value = std::make_shared( + preallocated_->Slice(batch_start_position, batch.length)); + } else { + // Otherwise write directly into preallocated_. 
The main difference + // computationally (versus the Slice approach) is that the null_count + // may not need to be recomputed in the result + out->value = preallocated_; + } + } else { + // We preallocate (maybe) only for the output of processing the current + // batch + ARROW_ASSIGN_OR_RAISE(out->value, PrepareOutput(batch.length)); + } + } + // XXX: Scalar outputs are the responsibility of the kernel? + return Status::OK(); + } + + Result> PrepareOutput(int64_t length) { + auto out = std::make_shared(output_descr_.type, length); + out->buffers.resize(output_num_buffers_); + + const auto& fw_type = checked_cast(*out->type); + if (validity_preallocated_) { + ARROW_ASSIGN_OR_RAISE(out->buffers[0], kernel_ctx_.AllocateBitmap(length)); + } + if (data_preallocated_) { + ARROW_ASSIGN_OR_RAISE( + out->buffers[1], AllocateDataBuffer(&kernel_ctx_, length, fw_type.bit_width())); + } + return out; + } + + Status SetupPreallocation(int64_t total_length) { + // Decide if we need to preallocate memory for this kernel + data_preallocated_ = ((kernel_->mem_allocation == MemAllocation::PREALLOCATE) && + CanPreallocate(*output_descr_.type)); + + validity_preallocated_ = + (kernel_->null_handling != NullHandling::COMPUTED_NO_PREALLOCATE && + kernel_->null_handling != NullHandling::OUTPUT_NOT_NULL); + + // Contiguous preallocation only possible if both the VALIDITY and DATA can + // be preallocated. Otherwise, we must go chunk-by-chunk. Note that when + // the DATA cannot be preallocated, the VALIDITY may still be preallocated + // depending on the NullHandling of the kernel + // + // Some kernels are unable to write into sliced outputs, so we respect the + // kernel's attributes + preallocate_contiguous_ = + (exec_ctx_->preallocate_contiguous() && kernel_->can_write_into_slices && + data_preallocated_ && validity_preallocated_); + if (preallocate_contiguous_) { + // TODO: Are there contiguous preallocation scenarios that are NOT + // primitive (2-buffer)? 
+ DCHECK_EQ(2, output_num_buffers_); + ARROW_ASSIGN_OR_RAISE(preallocated_, PrepareOutput(total_length)); + } + return Status::OK(); + } + + // Lift protected members so we don't have to use this-> + using BASE::batch_iterator_; + using BASE::exec_ctx_; + using BASE::func_; + using BASE::kernel_; + using BASE::kernel_ctx_; + using BASE::options_; + using BASE::output_descr_; + using BASE::state_; + + int output_num_buffers_; + + // If true, then the kernel writes into a preallocated data buffer + bool data_preallocated_ = false; + + // If true, then memory is preallocated for the validity bitmap with the same + // strategy as the data buffer(s). + bool validity_preallocated_ = false; + + // If true, and the kernel and output type supports preallocation (for both + // the validity and data buffers), then we allocate one big array and then + // iterate through it while executing the kernel in chunks + bool preallocate_contiguous_ = false; + + // For storing a contiguous preallocation per above. 
Unused otherwise + std::shared_ptr preallocated_; +}; + +class ScalarExecutor : public ArrayExecutor { + public: + using FunctionType = ScalarFunction; + static constexpr Function::Kind function_kind = Function::SCALAR; + using BASE = ArrayExecutor; + using BASE::BASE; +}; + +class VectorExecutor : public ArrayExecutor { + public: + using FunctionType = VectorFunction; + static constexpr Function::Kind function_kind = Function::VECTOR; + using BASE = ArrayExecutor; + using BASE::BASE; +}; + +class ScalarAggExecutor : public FunctionExecutorImpl { + public: + using FunctionType = ScalarAggregateFunction; + static constexpr Function::Kind function_kind = Function::SCALAR_AGGREGATE; + using BASE = FunctionExecutorImpl; + + Status Consume(const ExecBatch& batch) { + auto batch_state = kernel_->init(&kernel_ctx_, *kernel_, options_); + KernelContext batch_ctx(exec_ctx_); + batch_ctx.SetState(batch_state.get()); + + kernel_->consume(&batch_ctx, batch); + CTX_RETURN_IF_ERROR(&batch_ctx); + + kernel_->merge(&kernel_ctx_, *batch_state, state_.get()); + CTX_RETURN_IF_ERROR(&kernel_ctx_); + return Status::OK(); + } + + Status Execute(const std::vector& args, ExecListener* listener) override { + RETURN_NOT_OK(BindArgs(args)); + + // This is the global/total state for the aggregation. 
Batches are + // aggregated independently and then merged into the state + RETURN_NOT_OK(InitState()); + + ExecBatch batch; + while (batch_iterator_->Next(&batch)) { + // TODO: implement parallelism + if (batch.length > 0) { + RETURN_NOT_OK(Consume(batch)); + } + } + + Datum out; + kernel_->finalize(&kernel_ctx_, &out); + CTX_RETURN_IF_ERROR(&kernel_ctx_); + RETURN_NOT_OK(listener->OnResult(std::move(out))); + return Status::OK(); + } + + private: + using BASE::BASE; +}; + +template +Result> MakeExecutor(ExecContext* ctx, + const Function* func, + const FunctionOptions* options) { + DCHECK_EQ(ExecutorType::function_kind, func->kind()); + auto typed_func = checked_cast(func); + return std::unique_ptr(new ExecutorType(ctx, typed_func, options)); +} + +Result> FunctionExecutor::Make( + ExecContext* ctx, const Function* func, const FunctionOptions* options) { + switch (func->kind()) { + case Function::SCALAR: + return MakeExecutor(ctx, func, options); + case Function::VECTOR: + return MakeExecutor(ctx, func, options); + case Function::SCALAR_AGGREGATE: + return MakeExecutor(ctx, func, options); + } +} + +Status CheckAllValues(const std::vector& values) { + for (const auto& value : values) { + if (!value.is_value()) { + return Status::Invalid("Datum contained non-scalar/array type"); + } + } + return Status::OK(); +} + +Status ExecuteFunction(ExecContext* ctx, const std::string& func_name, + const std::vector& args, const FunctionOptions* options, + ValueDescr* out_descr, ExecListener* listener) { + if (ctx == nullptr) { + ExecContext default_ctx; + return ExecuteFunction(&default_ctx, func_name, args, options, out_descr, listener); + } + + // type-check Datum arguments here. 
Really we'd like to avoid this as much as + // possible + RETURN_NOT_OK(CheckAllValues(args)); + + ARROW_ASSIGN_OR_RAISE(std::shared_ptr func, + ctx->func_registry()->GetFunction(func_name)); + ARROW_ASSIGN_OR_RAISE(auto executor, FunctionExecutor::Make(ctx, func.get(), options)); + RETURN_NOT_OK(executor->Execute(args, listener)); + *out_descr = executor->output_descr(); + return Status::OK(); +} + +} // namespace detail + +ExecContext::ExecContext(MemoryPool* pool, FunctionRegistry* func_registry) + : pool_(pool) { + this->func_registry_ = func_registry == nullptr ? GetFunctionRegistry() : func_registry; +} + +internal::CpuInfo* ExecContext::cpu_info() const { + return internal::CpuInfo::GetInstance(); +} + +// ---------------------------------------------------------------------- +// SelectionVector + +SelectionVector::SelectionVector(std::shared_ptr data) + : data_(std::move(data)) { + DCHECK_EQ(Type::INT32, data_->type->id()); + DCHECK_EQ(0, data_->GetNullCount()); + indices_ = data_->GetValues(1); +} + +Result> SelectionVector::FromMask(const Array& arr) { + return Status::NotImplemented("FromMask"); +} + +namespace { + +std::shared_ptr ToChunkedArray(const std::vector& values, + const std::shared_ptr& type) { + std::vector> arrays; + for (const auto& val : values) { + arrays.emplace_back(val.make_array()); + } + return std::make_shared(arrays, type); +} + +bool HaveChunkedArray(const std::vector& values) { + for (const auto& value : values) { + if (value.kind() == Datum::CHUNKED_ARRAY) { + return true; + } + } + return false; +} + +Datum WrapArrayResults(const std::vector& input_args, + const std::vector& results, + const ValueDescr& output_descr) { + DCHECK_GT(results.size(), 0); + if (output_descr.shape == ValueDescr::SCALAR) { + if (results.size() == 1) { + // Return as SCALAR + return results[0]; + } else { + // Return as COLLECTION + return results; + } + } else { + // If execution yielded multiple chunks (because large arrays were split + // based on 
the ExecContext parameters, then the result is a ChunkedArray + if (HaveChunkedArray(input_args) || results.size() > 1) { + return ToChunkedArray(results, output_descr.type); + } else { + // Results have just one element + return results[0]; + } + } +} + +} // namespace + +Result ExecScalarFunction(ExecContext* ctx, const std::string& func_name, + const std::vector& args, + const FunctionOptions* options) { + auto listener = std::make_shared(); + ValueDescr out_descr; + RETURN_NOT_OK( + detail::ExecuteFunction(ctx, func_name, args, options, &out_descr, listener.get())); + return WrapArrayResults(args, listener->values(), out_descr); +} + +Result ExecVectorFunction(ExecContext* ctx, const std::string& func_name, + const std::vector& args, + const FunctionOptions* options) { + auto listener = std::make_shared(); + ValueDescr out_descr; + RETURN_NOT_OK( + detail::ExecuteFunction(ctx, func_name, args, options, &out_descr, listener.get())); + return WrapArrayResults(args, listener->values(), out_descr); +} + +Result ExecScalarAggregateFunction(ExecContext* ctx, const std::string& func_name, + const std::vector& args, + const FunctionOptions* options) { + auto listener = std::make_shared(); + ValueDescr unused; + RETURN_NOT_OK( + detail::ExecuteFunction(ctx, func_name, args, options, &unused, listener.get())); + DCHECK_EQ(1, listener->values().size()); + return listener->values()[0]; +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec.h b/cpp/src/arrow/compute/exec.h new file mode 100644 index 00000000000..b473838e281 --- /dev/null +++ b/cpp/src/arrow/compute/exec.h @@ -0,0 +1,175 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// NOTE: API is EXPERIMENTAL and will change without going through a +// deprecation cycle + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/datum.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +namespace internal { + +class CpuInfo; + +} // namespace internal + +namespace compute { + +struct FunctionOptions; +class FunctionRegistry; + +// It seems like 64K might be a good default chunksize to use for execution +// based on the experience of other query processing systems, so using this for +// now. +static constexpr int64_t kDefaultExecChunksize = UINT16_MAX; + +/// \brief Context for expression-global variables and options used by +/// function evaluation +class ARROW_EXPORT ExecContext { + public: + // If no function registry passed, the default is used + explicit ExecContext(MemoryPool* pool = default_memory_pool(), + FunctionRegistry* func_registry = NULLPTR); + + MemoryPool* memory_pool() const { return pool_; } + + internal::CpuInfo* cpu_info() const; + + FunctionRegistry* func_registry() const { return func_registry_; } + + // \brief Set maximum length unit of work for kernel execution. Larger inputs + // will be split into smaller chunks, and, if desired, processed in + // parallel. 
Set to -1 for no limit + void set_exec_chunksize(int64_t chunksize) { exec_chunksize_ = chunksize; } + + // \brief Maximum length unit of work for kernel execution. + int64_t exec_chunksize() const { return exec_chunksize_; } + + /// \brief Set whether to use multiple threads for function execution + void set_use_threads(bool use_threads = true) { use_threads_ = use_threads; } + + /// \brief If true, then utilize multiple threads where relevant for function + /// execution + bool use_threads() const { return use_threads_; } + + // Set the preallocation strategy for kernel execution as it relates to + // chunked execution. For chunked execution, whether via ChunkedArray inputs + // or splitting larger Array arguments into smaller pieces, contiguous + // allocation (if permitted by the kernel) will allocate one large array to + // write output into yielding it to the caller at the end. If this option is + // set to off, then preallocations will be performed independently for each + // chunk of execution + // + // TODO: At some point we might want the limit the size of contiguous + // preallocations (for example, merging small ChunkedArray chunks until + // reaching some desired size) + void set_preallocate_contiguous(bool preallocate = true) { + preallocate_contiguous_ = preallocate; + } + + bool preallocate_contiguous() const { return preallocate_contiguous_; } + + private: + MemoryPool* pool_; + FunctionRegistry* func_registry_; + int64_t exec_chunksize_ = -1; + bool preallocate_contiguous_ = true; + bool use_threads_ = true; +}; + +// TODO: Consider standardizing on uint16 selection vectors and only use them +// when we can ensure that each value is 64K length or smaller + +/// \brief Container for a int32 selection +class ARROW_EXPORT SelectionVector { + public: + explicit SelectionVector(std::shared_ptr data); + + explicit SelectionVector(const Array& arr) : SelectionVector(arr.data()) {} + + /// \brief Create SelectionVector from boolean mask + static Result> 
FromMask(const Array& arr); + + int32_t index(int i) const { return indices_[i]; } + const int32_t* indices() const { return indices_; } + int32_t length() const { return static_cast(data_->length); } + + private: + std::shared_ptr data_; + const int32_t* indices_; +}; + +struct ExecBatch { + ExecBatch() {} + ExecBatch(std::vector values, int64_t length) + : values(std::move(values)), length(length) {} + + std::vector values; + std::shared_ptr selection_vector; + int64_t length; + const Datum& operator[](int i) const { return values[i]; } + + int num_values() const { return static_cast(values.size()); } + + std::vector GetDescriptors() const { + std::vector result; + for (const auto& value : this->values) { + result.emplace_back(value.descr()); + } + return result; + } +}; + +/// \brief Convenience method for invoking a scalar (elementwise) array +/// function, including handling iteration on ChunkedArray inputs +ARROW_EXPORT +Result ExecScalarFunction(ExecContext* ctx, const std::string& func_name, + const std::vector& args, + const FunctionOptions* options = NULLPTR); + +/// \brief Convenience method for invoking a vector array function, including +/// handling iteration on ChunkedArray inputs. Compared with a scalar function, +/// vector functions may require post-processing of chunked outputs if the +/// results are dependent on the whole data passed (e.g. 
with hash table +/// functions) +ARROW_EXPORT +Result ExecVectorFunction(ExecContext* ctx, const std::string& func_name, + const std::vector& args, + const FunctionOptions* options = NULLPTR); + +/// \brief Convenience method for invoking a scalar aggregate function, +/// including handling iteration on ChunkedArray inputs +ARROW_EXPORT +Result ExecScalarAggregateFunction(ExecContext* ctx, const std::string& func_name, + const std::vector& args, + const FunctionOptions* options = NULLPTR); + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec_internal.h b/cpp/src/arrow/compute/exec_internal.h new file mode 100644 index 00000000000..1c61541d557 --- /dev/null +++ b/cpp/src/arrow/compute/exec_internal.h @@ -0,0 +1,128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/compute/exec.h" +#include "arrow/compute/kernel.h" +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace compute { + +class Function; + +// \brief Make a copy of the buffers into a destination array without carrying +// the type. 
+static inline void ZeroCopyData(const ArrayData& input, ArrayData* output) { + output->length = input.length; + output->SetNullCount(input.null_count); + output->buffers = input.buffers; + output->offset = input.offset; + output->child_data = input.child_data; +} + +namespace detail { + +/// \brief Break std::vector into a sequence of ExecBatch for kernel +/// execution +class ARROW_EXPORT ExecBatchIterator { + public: + /// \brief Construct iterator and do basic argument validation + /// + /// \param[in] args the Datum argument, must be all array-like or scalar + /// \param[in] max_chunksize the maximum length of each ExecBatch. Depending + /// on the chunk layout of ChunkedArray. Default of -1 means no maximum, so + /// as greedy as possible + static Result> Make(std::vector args, + int64_t max_chunksize = -1); + + /// \brief Compute the next batch. Always returns at least one batch. Return + /// false if the iterator is exhausted + bool Next(ExecBatch* batch); + + int64_t length() const { return length_; } + + int64_t position() const { return position_; } + + int64_t max_chunksize() const { return max_chunksize_; } + + private: + ExecBatchIterator(std::vector args, int64_t length, int64_t max_chunksize); + + std::vector args_; + std::vector chunk_indexes_; + std::vector chunk_positions_; + int64_t position_; + int64_t length_; + int64_t max_chunksize_; + bool finished_; +}; + +// "Push" / listener API like IPC reader so that consumers can receive +// processed chunks as soon as they're available. 
+ +class ARROW_EXPORT ExecListener { + public: + virtual ~ExecListener() = default; + + virtual Status OnResult(Datum value); +}; + +class ARROW_EXPORT FunctionExecutor { + public: + virtual ~FunctionExecutor() = default; + + /// XXX: Better configurability for listener + /// Not thread-safe + virtual Status Execute(const std::vector& args, ExecListener* listener) = 0; + + virtual ValueDescr output_descr() const = 0; + + static Result> Make(ExecContext* ctx, + const Function* func, + const FunctionOptions* options); +}; + +ARROW_EXPORT +Status ExecuteFunction(ExecContext* ctx, const std::string& func_name, + const std::vector& args, const FunctionOptions* options, + ValueDescr* out_descr, ExecListener* listener); + +/// \brief Populate validity bitmap with the intersection of the nullity of the +/// arguments. If a preallocated bitmap is not provided, then one will be +/// allocated if needed (in some cases a bitmap can be zero-copied from the +/// arguments). If any Scalar value is null, then the entire validity bitmap +/// will be set to null. +/// +/// \param[in] ctx kernel execution context, for memory allocation etc. +/// \param[in] batch the data batch +/// \param[in] out the output ArrayData, must not be null +ARROW_EXPORT +Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* out); + +} // namespace detail +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/exec_test.cc b/cpp/src/arrow/compute/exec_test.cc new file mode 100644 index 00000000000..ceee12785fd --- /dev/null +++ b/cpp/src/arrow/compute/exec_test.cc @@ -0,0 +1,840 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include + +#include "arrow/testing/gtest_common.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/compute/exec.h" +#include "arrow/compute/exec_internal.h" +#include "arrow/compute/function.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/options.h" +#include "arrow/compute/registry.h" +#include "arrow/compute/test_util.h" +#include "arrow/memory_pool.h" +#include "arrow/pretty_print.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/cpu_info.h" + +namespace arrow { + +using internal::checked_cast; + +namespace compute { +namespace detail { + +TEST(ExecContext, BasicWorkings) { + { + ExecContext ctx; + ASSERT_EQ(GetFunctionRegistry(), ctx.func_registry()); + ASSERT_EQ(default_memory_pool(), ctx.memory_pool()); + + // No default chunksize right now + ASSERT_EQ(-1, ctx.exec_chunksize()); + + ASSERT_TRUE(ctx.use_threads()); + ASSERT_EQ(internal::CpuInfo::GetInstance(), ctx.cpu_info()); + } + + // Now, let's customize all the things + LoggingMemoryPool my_pool(default_memory_pool()); + std::unique_ptr custom_reg = FunctionRegistry::Make(); + ExecContext ctx(&my_pool, custom_reg.get()); + + ASSERT_EQ(custom_reg.get(), ctx.func_registry()); + ASSERT_EQ(&my_pool, ctx.memory_pool()); + + ctx.set_exec_chunksize(1 
<< 20); + ASSERT_EQ(1 << 20, ctx.exec_chunksize()); + + ctx.set_use_threads(false); + ASSERT_FALSE(ctx.use_threads()); +} + +TEST(SelectionVector, Basics) { + auto indices = ArrayFromJSON(int32(), "[0, 3]"); + auto sel_vector = std::make_shared(*indices); + + ASSERT_EQ(indices->length(), sel_vector->length()); + ASSERT_EQ(3, sel_vector->index(1)); + ASSERT_EQ(3, sel_vector->indices()[1]); +} + +void AssertValidityZeroExtraBits(const ArrayData& arr) { + const Buffer& buf = *arr.buffers[0]; + + const int64_t bit_extent = ((arr.offset + arr.length + 7) / 8) * 8; + for (int64_t i = arr.offset + arr.length; i < bit_extent; ++i) { + EXPECT_FALSE(BitUtil::GetBit(buf.data(), i)) << i; + } +} + +class TestComputeInternals : public ::testing::Test { + public: + void SetUp() { + registry_ = FunctionRegistry::Make(); + rng_.reset(new random::RandomArrayGenerator(/*seed=*/0)); + ResetContexts(); + } + + void ResetContexts() { + exec_ctx_.reset(new ExecContext(default_memory_pool(), registry_.get())); + ctx_.reset(new KernelContext(exec_ctx_.get())); + } + + std::shared_ptr GetUInt8Array(int64_t size, double null_probability = 0.1) { + return rng_->UInt8(size, /*min=*/0, /*max=*/100, null_probability); + } + + std::shared_ptr GetInt32Array(int64_t size, double null_probability = 0.1) { + return rng_->Int32(size, /*min=*/0, /*max=*/1000, null_probability); + } + + std::shared_ptr GetFloat64Array(int64_t size, double null_probability = 0.1) { + return rng_->Float64(size, /*min=*/0, /*max=*/1000, null_probability); + } + + std::shared_ptr GetInt32Chunked(const std::vector& sizes) { + std::vector> chunks; + for (auto size : sizes) { + chunks.push_back(GetInt32Array(size)); + } + return std::make_shared(std::move(chunks)); + } + + protected: + std::unique_ptr exec_ctx_; + std::unique_ptr ctx_; + std::unique_ptr registry_; + std::unique_ptr rng_; +}; + +class TestPropagateNulls : public TestComputeInternals {}; + +TEST_F(TestPropagateNulls, UnknownNullCountWithNullsZeroCopies) { + 
const int64_t length = 16; + + constexpr uint8_t validity_bitmap[8] = {254, 0, 0, 0, 0, 0, 0, 0}; + auto nulls = std::make_shared(validity_bitmap, 8); + + ArrayData output(boolean(), length, {nullptr, nullptr}); + ArrayData input(boolean(), length, {nulls, nullptr}, kUnknownNullCount); + + ExecBatch batch({input}, length); + ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_EQ(nulls.get(), output.buffers[0].get()); + ASSERT_EQ(kUnknownNullCount, output.null_count); + ASSERT_EQ(9, output.GetNullCount()); +} + +TEST_F(TestPropagateNulls, UnknownNullCountWithoutNulls) { + const int64_t length = 16; + constexpr uint8_t validity_bitmap[8] = {255, 255, 0, 0, 0, 0, 0, 0}; + auto nulls = std::make_shared(validity_bitmap, 8); + + ArrayData output(boolean(), length, {nullptr, nullptr}); + ArrayData input(boolean(), length, {nulls, nullptr}, kUnknownNullCount); + + ExecBatch batch({input}, length); + ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + EXPECT_EQ(-1, output.null_count); + EXPECT_EQ(nulls.get(), output.buffers[0].get()); +} + +TEST_F(TestPropagateNulls, SetAllNulls) { + const int64_t length = 16; + + auto CheckSetAllNull = [&](std::vector values, bool preallocate) { + // Make fresh bitmap with all 1's + uint8_t bitmap_data[2] = {255, 255}; + auto preallocated_mem = std::make_shared(bitmap_data, 2); + + std::vector> buffers(2); + if (preallocate) { + buffers[0] = preallocated_mem; + } + + ArrayData output(boolean(), length, buffers); + + ExecBatch batch(values, length); + ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + + if (preallocate) { + // Ensure that buffer object the same when we pass in preallocated memory + ASSERT_EQ(preallocated_mem.get(), output.buffers[0].get()); + } + ASSERT_NE(nullptr, output.buffers[0]); + uint8_t expected[2] = {0, 0}; + const Buffer& out_buf = *output.buffers[0]; + ASSERT_EQ(0, std::memcmp(out_buf.data(), expected, out_buf.size())); + }; + + // There is a null scalar + std::shared_ptr i32_val = 
std::make_shared(3); + std::vector vals = {i32_val, MakeNullScalar(boolean())}; + CheckSetAllNull(vals, true); + CheckSetAllNull(vals, false); + + const double true_prob = 0.5; + + vals[0] = rng_->Boolean(length, true_prob); + CheckSetAllNull(vals, true); + CheckSetAllNull(vals, false); + + auto arr_all_nulls = rng_->Boolean(length, true_prob, /*null_probability=*/1); + + // One value is all null + vals = {rng_->Boolean(length, true_prob, /*null_probability=*/0.5), arr_all_nulls}; + CheckSetAllNull(vals, true); + CheckSetAllNull(vals, false); + + // A value is NullType + std::shared_ptr null_arr = std::make_shared(length); + vals = {rng_->Boolean(length, true_prob), null_arr}; + CheckSetAllNull(vals, true); + CheckSetAllNull(vals, false); + + // Other nitty-gritty scenarios + { + // An all-null bitmap is zero-copied over, even though there is a + // null-scalar earlier in the batch + ArrayData output(boolean(), length, {nullptr, nullptr}); + ExecBatch batch({MakeNullScalar(boolean()), arr_all_nulls}, length); + ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_EQ(arr_all_nulls->data()->buffers[0].get(), output.buffers[0].get()); + } +} + +TEST_F(TestPropagateNulls, SingleValueWithNulls) { + // Input offset is non-zero (0 mod 8 and nonzero mod 8 cases) + const int64_t length = 100; + auto arr = rng_->Boolean(length, 0.5, /*null_probability=*/0.5); + + auto CheckSliced = [&](int64_t offset, bool preallocate = false, + int64_t out_offset = 0) { + // Unaligned bitmap, zero copy not possible + auto sliced = arr->Slice(offset); + std::vector vals = {sliced}; + + ArrayData output(boolean(), vals[0].length(), {nullptr, nullptr}); + output.offset = out_offset; + + ExecBatch batch(vals, vals[0].length()); + + std::shared_ptr preallocated_bitmap; + if (preallocate) { + ASSERT_OK_AND_ASSIGN( + preallocated_bitmap, + AllocateBuffer(BitUtil::BytesForBits(sliced->length() + out_offset))); + std::memset(preallocated_bitmap->mutable_data(), 0, 
preallocated_bitmap->size()); + output.buffers[0] = preallocated_bitmap; + } else { + ASSERT_EQ(0, output.offset); + } + + ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + + if (!preallocate) { + const Buffer* parent_buf = arr->data()->buffers[0].get(); + if (offset == 0) { + // Validity bitmap same, no slice + ASSERT_EQ(parent_buf, output.buffers[0].get()); + } else if (offset % 8 == 0) { + // Validity bitmap sliced + ASSERT_NE(parent_buf, output.buffers[0].get()); + ASSERT_EQ(parent_buf, output.buffers[0]->parent().get()); + } else { + // New memory for offset not 0 mod 8 + ASSERT_NE(parent_buf, output.buffers[0].get()); + ASSERT_EQ(nullptr, output.buffers[0]->parent()); + } + } else { + // preallocated, so check that the validity bitmap is unbothered + ASSERT_EQ(preallocated_bitmap.get(), output.buffers[0].get()); + } + + ASSERT_EQ(arr->Slice(offset)->null_count(), output.GetNullCount()); + + ASSERT_TRUE(internal::BitmapEquals(output.buffers[0]->data(), output.offset, + sliced->null_bitmap_data(), sliced->offset(), + output.length)); + AssertValidityZeroExtraBits(output); + }; + + CheckSliced(8); + CheckSliced(7); + CheckSliced(8, /*preallocated=*/true); + CheckSliced(7, true); + CheckSliced(8, true, /*offset=*/4); + CheckSliced(7, true, 4); +} + +TEST_F(TestPropagateNulls, ZeroCopyWhenZeroNullsOnOneInput) { + const int64_t length = 16; + + constexpr uint8_t validity_bitmap[8] = {254, 0, 0, 0, 0, 0, 0, 0}; + auto nulls = std::make_shared(validity_bitmap, 8); + + ArrayData some_nulls(boolean(), 16, {nulls, nullptr}, /*null_count=*/9); + ArrayData no_nulls(boolean(), length, {nullptr, nullptr}, /*null_count=*/0); + + ArrayData output(boolean(), length, {nullptr, nullptr}); + ExecBatch batch({some_nulls, no_nulls}, length); + ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_EQ(nulls.get(), output.buffers[0].get()); + ASSERT_EQ(9, output.null_count); + + // Flip order of args + output = ArrayData(boolean(), length, {nullptr, nullptr}); + 
batch.values = {no_nulls, no_nulls, some_nulls}; + ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_EQ(nulls.get(), output.buffers[0].get()); + ASSERT_EQ(9, output.null_count); + + // Check that preallocated memory is not clobbered + uint8_t bitmap_data[2] = {0, 0}; + auto preallocated_mem = std::make_shared(bitmap_data, 2); + output.null_count = kUnknownNullCount; + output.buffers[0] = preallocated_mem; + ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + + ASSERT_EQ(preallocated_mem.get(), output.buffers[0].get()); + ASSERT_EQ(9, output.null_count); + ASSERT_EQ(254, bitmap_data[0]); + ASSERT_EQ(0, bitmap_data[1]); +} + +TEST_F(TestPropagateNulls, IntersectsNulls) { + const int64_t length = 16; + + // 0b01111111 0b11001111 + constexpr uint8_t bitmap1[8] = {127, 207, 0, 0, 0, 0, 0, 0}; + + // 0b11111110 0b01111111 + constexpr uint8_t bitmap2[8] = {254, 127, 0, 0, 0, 0, 0, 0}; + + // 0b11101111 0b11111110 + constexpr uint8_t bitmap3[8] = {239, 254, 0, 0, 0, 0, 0, 0}; + + ArrayData arr1(boolean(), length, {std::make_shared(bitmap1, 8), nullptr}); + ArrayData arr2(boolean(), length, {std::make_shared(bitmap2, 8), nullptr}); + ArrayData arr3(boolean(), length, {std::make_shared(bitmap3, 8), nullptr}); + + auto CheckCase = [&](std::vector values, int64_t ex_null_count, + const uint8_t* ex_bitmap, bool preallocate = false, + int64_t output_offset = 0) { + ExecBatch batch(values, length); + + std::shared_ptr nulls; + if (preallocate) { + // Make the buffer one byte bigger so we can have non-zero offsets + ASSERT_OK_AND_ASSIGN(nulls, AllocateBuffer(3)); + std::memset(nulls->mutable_data(), 0, nulls->size()); + } else { + // non-zero output offset not permitted unless the output memory is + // preallocated + ASSERT_EQ(0, output_offset); + } + ArrayData output(boolean(), length, {nulls, nullptr}); + output.offset = output_offset; + + ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + + // Preallocated memory used + if (preallocate) { + 
ASSERT_EQ(nulls.get(), output.buffers[0].get()); + } + + EXPECT_EQ(kUnknownNullCount, output.null_count); + EXPECT_EQ(ex_null_count, output.GetNullCount()); + + const auto& out_buffer = *output.buffers[0]; + + ASSERT_TRUE(internal::BitmapEquals(out_buffer.data(), output_offset, ex_bitmap, + /*ex_offset=*/0, length)); + + // Now check that the rest of the bits in out_buffer are still 0 + AssertValidityZeroExtraBits(output); + }; + + // 0b01101110 0b01001110 + uint8_t expected1[2] = {110, 78}; + CheckCase({arr1, arr2, arr3}, 7, expected1); + CheckCase({arr1, arr2, arr3}, 7, expected1, /*preallocate=*/true); + CheckCase({arr1, arr2, arr3}, 7, expected1, /*preallocate=*/true, + /*output_offset=*/4); + + // 0b01111110 0b01001111 + uint8_t expected2[2] = {126, 79}; + CheckCase({arr1, arr2}, 5, expected2); + CheckCase({arr1, arr2}, 5, expected2, /*preallocate=*/true, + /*output_offset=*/4); +} + +TEST_F(TestPropagateNulls, NullOutputTypeNoop) { + // Ensure we leave the buffers alone when the output type is null() + const int64_t length = 100; + ExecBatch batch({rng_->Boolean(100, 0.5, 0.5)}, length); + + ArrayData output(null(), length, {nullptr}); + ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_EQ(nullptr, output.buffers[0]); +} + +// ---------------------------------------------------------------------- +// ExecBatchIterator + +class TestExecBatchIterator : public TestComputeInternals { + public: + void SetupIterator(std::vector args, int64_t max_chunksize = -1) { + ASSERT_OK_AND_ASSIGN(iterator_, + ExecBatchIterator::Make(std::move(args), max_chunksize)); + } + void CheckIteration(const std::vector& args, int chunksize, + const std::vector& ex_batch_sizes) { + SetupIterator(args, chunksize); + ExecBatch batch; + int64_t position = 0; + for (size_t i = 0; i < ex_batch_sizes.size(); ++i) { + ASSERT_EQ(position, iterator_->position()); + ASSERT_TRUE(iterator_->Next(&batch)); + ASSERT_EQ(ex_batch_sizes[i], batch.length); + + for (size_t j = 0; j < 
args.size(); ++j) { + switch (args[j].kind()) { + case Datum::SCALAR: + ASSERT_TRUE(args[j].scalar()->Equals(batch[j].scalar())); + break; + case Datum::ARRAY: + AssertArraysEqual(*args[j].make_array()->Slice(position, batch.length), + *batch[j].make_array()); + break; + case Datum::CHUNKED_ARRAY: { + const ChunkedArray& carr = *args[j].chunked_array(); + if (batch.length == 0) { + ASSERT_EQ(0, carr.length()); + } else { + auto arg_slice = carr.Slice(position, batch.length); + // The sliced ChunkedArrays should only ever be 1 chunk + ASSERT_EQ(1, arg_slice->num_chunks()); + AssertArraysEqual(*arg_slice->chunk(0), *batch[j].make_array()); + } + } break; + default: + break; + } + } + position += ex_batch_sizes[i]; + } + // Ensure that the iterator is exhausted + ASSERT_FALSE(iterator_->Next(&batch)); + + ASSERT_EQ(iterator_->length(), iterator_->position()); + } + + protected: + std::unique_ptr iterator_; +}; + +TEST_F(TestExecBatchIterator, Basics) { + const int64_t length = 100; + + // Simple case with a single chunk + std::vector args = {Datum(GetInt32Array(length)), Datum(GetFloat64Array(length)), + Datum(std::make_shared(3))}; + SetupIterator(args); + + ExecBatch batch; + ASSERT_TRUE(iterator_->Next(&batch)); + ASSERT_EQ(3, batch.values.size()); + ASSERT_EQ(3, batch.num_values()); + ASSERT_EQ(length, batch.length); + + std::vector descrs = batch.GetDescriptors(); + ASSERT_EQ(ValueDescr::Array(int32()), descrs[0]); + ASSERT_EQ(ValueDescr::Array(float64()), descrs[1]); + ASSERT_EQ(ValueDescr::Scalar(int32()), descrs[2]); + + AssertArraysEqual(*args[0].make_array(), *batch[0].make_array()); + AssertArraysEqual(*args[1].make_array(), *batch[1].make_array()); + ASSERT_TRUE(args[2].scalar()->Equals(batch[2].scalar())); + + ASSERT_EQ(length, iterator_->position()); + ASSERT_FALSE(iterator_->Next(&batch)); + + // Split into chunks of size 16 + CheckIteration(args, /*chunksize=*/16, {16, 16, 16, 16, 16, 16, 4}); +} + +TEST_F(TestExecBatchIterator, InputValidation) { + 
std::vector args = {Datum(GetInt32Array(10)), Datum(GetInt32Array(9))}; + ASSERT_RAISES(Invalid, ExecBatchIterator::Make(args)); + + args = {Datum(GetInt32Array(9)), Datum(GetInt32Array(10))}; + ASSERT_RAISES(Invalid, ExecBatchIterator::Make(args)); + + args = {Datum(GetInt32Array(10))}; + ASSERT_OK_AND_ASSIGN(auto iterator, ExecBatchIterator::Make(args)); + ASSERT_EQ(10, iterator->max_chunksize()); +} + +TEST_F(TestExecBatchIterator, ChunkedArrays) { + std::vector args = {Datum(GetInt32Chunked({0, 20, 10})), + Datum(GetInt32Chunked({15, 15})), Datum(GetInt32Array(30)), + Datum(std::make_shared(5)), + Datum(MakeNullScalar(boolean()))}; + + CheckIteration(args, /*chunksize=*/10, {10, 5, 5, 10}); + CheckIteration(args, /*chunksize=*/20, {15, 5, 10}); + CheckIteration(args, /*chunksize=*/30, {15, 5, 10}); +} + +TEST_F(TestExecBatchIterator, ZeroLengthCases) { + auto carr = std::shared_ptr(new ChunkedArray({}, int32())); + + // Zero-length ChunkedArray with zero chunks + std::vector args = {Datum(carr)}; + CheckIteration(args, /*chunksize=*/10, {0}); + + // Zero-length array + args = {Datum(GetInt32Array(0))}; + CheckIteration(args, /*chunksize=*/10, {0}); + + // ChunkedArray with single empty chunk + args = {Datum(GetInt32Chunked({0}))}; + CheckIteration(args, /*chunksize=*/10, {0}); +} + +// ---------------------------------------------------------------------- +// Scalar function execution + +void ExecCopy(KernelContext*, const ExecBatch& batch, Datum* out) { + DCHECK_EQ(1, batch.num_values()); + const auto& type = checked_cast(*batch[0].type()); + int value_size = type.bit_width() / 8; + + const ArrayData& arg0 = *batch[0].array(); + ArrayData* out_arr = out->mutable_array(); + uint8_t* dst = out_arr->buffers[1]->mutable_data() + out_arr->offset * value_size; + const uint8_t* src = arg0.buffers[1]->data() + arg0.offset * value_size; + std::memcpy(dst, src, batch.length * value_size); +} + +void ExecComputedBitmap(KernelContext* ctx, const ExecBatch& batch, Datum* 
out) { + // Propagate nulls not used. Check that the out bitmap isn't the same already + // as the input bitmap + const ArrayData& arg0 = *batch[0].array(); + ArrayData* out_arr = out->mutable_array(); + + DCHECK(!internal::BitmapEquals(arg0.buffers[0]->data(), arg0.offset, + out_arr->buffers[0]->data(), out_arr->offset, + batch.length)); + internal::CopyBitmap(arg0.buffers[0]->data(), arg0.offset, batch.length, + out_arr->buffers[0]->mutable_data(), out_arr->offset); + ExecCopy(ctx, batch, out); +} + +void ExecNoPreallocatedData(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + // Validity preallocated, but not the data + ArrayData* out_arr = out->mutable_array(); + DCHECK_EQ(0, out_arr->offset); + const auto& type = checked_cast(*batch[0].type()); + int value_size = type.bit_width() / 8; + Status s = (ctx->Allocate(out_arr->length * value_size).Value(&out_arr->buffers[1])); + DCHECK_OK(s); + ExecCopy(ctx, batch, out); +} + +void ExecNoPreallocatedAnything(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + // Neither validity nor data preallocated + ArrayData* out_arr = out->mutable_array(); + DCHECK_EQ(0, out_arr->offset); + Status s = (ctx->AllocateBitmap(out_arr->length).Value(&out_arr->buffers[0])); + DCHECK_OK(s); + const ArrayData& arg0 = *batch[0].array(); + internal::CopyBitmap(arg0.buffers[0]->data(), arg0.offset, batch.length, + out_arr->buffers[0]->mutable_data(), /*offset=*/0); + + // Reuse the kernel that allocates the data + ExecNoPreallocatedData(ctx, batch, out); +} + +struct ExampleOptions : public FunctionOptions { + std::shared_ptr value; + explicit ExampleOptions(std::shared_ptr value) : value(std::move(value)) {} +}; + +struct ExampleState : public KernelState { + std::shared_ptr value; + explicit ExampleState(std::shared_ptr value) : value(std::move(value)) {} +}; + +std::unique_ptr InitStateful(KernelContext*, const Kernel&, + const FunctionOptions* options) { + auto func_options = static_cast(options); + return 
std::unique_ptr(new ExampleState{func_options->value}); +} + +void ExecStateful(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + // We take the value from the state and multiply the data in batch[0] with it + ExampleState* state = static_cast(ctx->state()); + int32_t multiplier = checked_cast(*state->value).value; + + const ArrayData& arg0 = *batch[0].array(); + ArrayData* out_arr = out->mutable_array(); + const int32_t* arg0_data = arg0.GetValues(1); + int32_t* dst = out_arr->GetMutableValues(1); + for (int64_t i = 0; i < arg0.length; ++i) { + dst[i] = arg0_data[i] * multiplier; + } +} + +void ExecAddInt32(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const Int32Scalar& arg0 = batch[0].scalar_as(); + const Int32Scalar& arg1 = batch[1].scalar_as(); + out->value = std::make_shared(arg0.value + arg1.value); +} + +class TestExecScalarFunction : public TestComputeInternals { + public: + void SetUp() { + TestComputeInternals::SetUp(); + + AddCopyFunctions(); + AddNoPreallocateFunctions(); + AddStatefulFunction(); + AddScalarFunction(); + } + + void AddCopyFunctions() { + // This function simply copies memory from the input argument into the + // (preallocated) output + auto func = std::make_shared("copy", 1); + + // Add a few kernels. 
Our implementation only accepts arrays + ASSERT_OK(func->AddKernel({InputType::Array(uint8())}, uint8(), ExecCopy)); + ASSERT_OK(func->AddKernel({InputType::Array(int32())}, int32(), ExecCopy)); + ASSERT_OK(func->AddKernel({InputType::Array(float64())}, float64(), ExecCopy)); + ASSERT_OK(registry_->AddFunction(func)); + + // A version which doesn't want the executor to call PropagateNulls + auto func2 = std::make_shared("copy_computed_bitmap", 1); + ScalarKernel kernel({InputType::Array(uint8())}, uint8(), ExecComputedBitmap); + kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE; + ASSERT_OK(func2->AddKernel(kernel)); + ASSERT_OK(registry_->AddFunction(func2)); + } + + void AddNoPreallocateFunctions() { + // A function that allocates its own output memory. We have cases for both + // non-preallocated data and non-preallocated validity bitmap + auto f1 = std::make_shared("nopre_data", 1); + auto f2 = std::make_shared("nopre_validity_or_data", 1); + + ScalarKernel kernel({InputType::Array(uint8())}, uint8(), ExecNoPreallocatedData); + kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; + ASSERT_OK(f1->AddKernel(kernel)); + + kernel.exec = ExecNoPreallocatedAnything; + kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; + ASSERT_OK(f2->AddKernel(kernel)); + + ASSERT_OK(registry_->AddFunction(f1)); + ASSERT_OK(registry_->AddFunction(f2)); + } + + void AddStatefulFunction() { + // This function's behavior depends on a static parameter that is made + // available to the kernel's execution function through its Options object + auto func = std::make_shared("stateful", 1); + + ScalarKernel kernel({InputType::Array(int32())}, int32(), ExecStateful, InitStateful); + ASSERT_OK(func->AddKernel(kernel)); + ASSERT_OK(registry_->AddFunction(func)); + } + + void AddScalarFunction() { + auto func = std::make_shared("scalar_add_int32", 2); + ASSERT_OK(func->AddKernel({InputType::Scalar(int32()), InputType::Scalar(int32())}, + int32(), ExecAddInt32)); + 
ASSERT_OK(registry_->AddFunction(func)); + } +}; + +TEST_F(TestExecScalarFunction, ArgumentValidation) { + // Copy accepts only a single array argument + Datum d1(GetInt32Array(10)); + + // Too many args + std::vector args = {d1, d1}; + ASSERT_RAISES(Invalid, ExecScalarFunction(exec_ctx_.get(), "copy", args)); + + // Too few + args = {}; + ASSERT_RAISES(Invalid, ExecScalarFunction(exec_ctx_.get(), "copy", args)); + + // Cannot do scalar + args = {Datum(std::make_shared(5))}; + ASSERT_RAISES(KeyError, ExecScalarFunction(exec_ctx_.get(), "copy", args)); +} + +TEST_F(TestExecScalarFunction, PreallocationCases) { + double null_prob = 0.2; + + auto arr = GetUInt8Array(50, null_prob); + + auto CheckFunction = [&](std::string func_name) { + ResetContexts(); + + // The default should be a single array output + { + std::vector args = {Datum(arr)}; + ASSERT_OK_AND_ASSIGN(Datum result, + ExecScalarFunction(exec_ctx_.get(), func_name, args)); + ASSERT_EQ(Datum::ARRAY, result.kind()); + AssertArraysEqual(*arr, *result.make_array()); + } + + // Set the exec_chunksize to be smaller, so now we have several invocations + // of the kernel, but still the output is onee array + { + std::vector args = {Datum(arr)}; + exec_ctx_->set_exec_chunksize(8); + ASSERT_OK_AND_ASSIGN(Datum result, + ExecScalarFunction(exec_ctx_.get(), func_name, args)); + AssertArraysEqual(*arr, *result.make_array()); + } + + exec_ctx_->set_exec_chunksize(12); + + // Chunksize not multiple of 8 + { + std::vector args = {Datum(arr)}; + exec_ctx_->set_exec_chunksize(12); + ASSERT_OK_AND_ASSIGN(Datum result, + ExecScalarFunction(exec_ctx_.get(), func_name, args)); + AssertArraysEqual(*arr, *result.make_array()); + } + + // Input is chunked, output has one big chunk + { + auto carr = std::shared_ptr( + new ChunkedArray({arr->Slice(0, 15), arr->Slice(15)})); + std::vector args = {Datum(carr)}; + ASSERT_OK_AND_ASSIGN(Datum result, + ExecScalarFunction(exec_ctx_.get(), func_name, args)); + std::shared_ptr actual = 
result.chunked_array(); + ASSERT_EQ(1, actual->num_chunks()); + AssertChunkedEquivalent(*carr, *actual); + } + + // Preallocate independently for each batch + { + std::vector args = {Datum(arr)}; + exec_ctx_->set_preallocate_contiguous(false); + exec_ctx_->set_exec_chunksize(20); + ASSERT_OK_AND_ASSIGN(Datum result, + ExecScalarFunction(exec_ctx_.get(), func_name, args)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); + const ChunkedArray& carr = *result.chunked_array(); + ASSERT_EQ(3, carr.num_chunks()); + AssertArraysEqual(*arr->Slice(0, 20), *carr.chunk(0)); + AssertArraysEqual(*arr->Slice(20, 20), *carr.chunk(1)); + AssertArraysEqual(*arr->Slice(40), *carr.chunk(2)); + } + }; + + CheckFunction("copy"); + CheckFunction("copy_computed_bitmap"); +} + +TEST_F(TestExecScalarFunction, BasicNonStandardCases) { + // Test a handful of cases + // + // * Validity bitmap computed by kernel rather than using PropagateNulls + // * Data not pre-allocated + // * Validity bitmap not pre-allocated + + double null_prob = 0.2; + + auto arr = GetUInt8Array(100, null_prob); + std::vector args = {Datum(arr)}; + + auto CheckFunction = [&](std::string func_name) { + ResetContexts(); + + // The default should be a single array output + { + exec_ctx_->set_exec_chunksize(-1); + ASSERT_OK_AND_ASSIGN(Datum result, + ExecScalarFunction(exec_ctx_.get(), func_name, args)); + AssertArraysEqual(*arr, *result.make_array(), true); + } + + // Split execution into 3 chunks + { + exec_ctx_->set_exec_chunksize(40); + ASSERT_OK_AND_ASSIGN(Datum result, + ExecScalarFunction(exec_ctx_.get(), func_name, args)); + ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); + const ChunkedArray& carr = *result.chunked_array(); + ASSERT_EQ(3, carr.num_chunks()); + AssertArraysEqual(*arr->Slice(0, 40), *carr.chunk(0)); + AssertArraysEqual(*arr->Slice(40, 40), *carr.chunk(1)); + AssertArraysEqual(*arr->Slice(80), *carr.chunk(2)); + } + }; + + CheckFunction("nopre_data"); + CheckFunction("nopre_validity_or_data"); +} + 
+TEST_F(TestExecScalarFunction, StatefulKernel) { + auto input = ArrayFromJSON(int32(), "[1, 2, 3, null, 5]"); + auto multiplier = std::make_shared(2); + auto expected = ArrayFromJSON(int32(), "[2, 4, 6, null, 10]"); + + ExampleOptions options(multiplier); + std::vector args = {Datum(input)}; + ASSERT_OK_AND_ASSIGN(Datum result, + ExecScalarFunction(exec_ctx_.get(), "stateful", args, &options)); + AssertArraysEqual(*expected, *result.make_array()); +} + +TEST_F(TestExecScalarFunction, ScalarFunction) { + std::vector args = {Datum(std::make_shared(5)), + Datum(std::make_shared(7))}; + ASSERT_OK_AND_ASSIGN(Datum result, + ExecScalarFunction(exec_ctx_.get(), "scalar_add_int32", args)); + ASSERT_EQ(Datum::SCALAR, result.kind()); + + auto expected = std::make_shared(12); + ASSERT_TRUE(expected->Equals(*result.scalar())); +} + +} // namespace detail +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/filter.h b/cpp/src/arrow/compute/filter.h new file mode 100644 index 00000000000..260e9909b00 --- /dev/null +++ b/cpp/src/arrow/compute/filter.h @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "arrow/compute/exec.h" +#include "arrow/compute/options.h" +#include "arrow/datum.h" +#include "arrow/result.h" + +namespace arrow { +namespace compute { + +class ExecContext; + +/// \brief Filter with a boolean selection filter +/// +/// The output will be populated with values from the input at positions +/// where the selection filter is not 0. Nulls in the filter will be handled +/// based on options.null_selection_behavior. +/// +/// For example given values = ["a", "b", "c", null, "e", "f"] and +/// filter = [0, 1, 1, 0, null, 1], the output will be +/// (null_selection_behavior == DROP) = ["b", "c", "f"] +/// (null_selection_behavior == EMIT_NULL) = ["b", "c", null, "f"] +/// +/// \param[in] values array to filter +/// \param[in] filter indicates which values should be filtered out +/// \param[in] options configures null_selection_behavior +/// \param[in] context the function execution context, optional +/// \return the resulting datum +ARROW_EXPORT +Result Filter(const Datum& values, const Datum& filter, + FilterOptions options = FilterOptions::Defaults(), + ExecContext* context = NULLPTR); + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc new file mode 100644 index 00000000000..1c29ab7ed3b --- /dev/null +++ b/cpp/src/arrow/compute/function.cc @@ -0,0 +1,150 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/function.h" + +#include +#include +#include + +namespace arrow { + +struct ValueDescr; + +namespace compute { + +static Status CheckArity(const std::vector& args, const FunctionArity& arity) { + const int passed_num_args = static_cast(args.size()); + if (arity.is_varargs && passed_num_args < arity.num_args) { + return Status::Invalid("Varargs function needs at least ", arity.num_args, + " arguments but kernel accepts only ", passed_num_args); + } else if (!arity.is_varargs && passed_num_args != arity.num_args) { + return Status::Invalid("Function accepts ", arity.num_args, + " arguments but kernel accepts ", passed_num_args); + } + return Status::OK(); +} + +template +std::string FormatArgTypes(const std::vector& descrs) { + std::stringstream ss; + ss << "("; + for (size_t i = 0; i < descrs.size(); ++i) { + if (i > 0) { + ss << ", "; + } + ss << descrs[i].ToString(); + } + ss << ")"; + return ss.str(); +} + +template +Result DispatchExactImpl(const Function& func, + const std::vector& kernels, + const std::vector& values) { + const int passed_num_args = static_cast(values.size()); + + // Validate arity + const FunctionArity arity = func.arity(); + if (arity.is_varargs && passed_num_args < arity.num_args) { + return Status::Invalid("Varargs function needs at least ", arity.num_args, + " arguments but passed only ", passed_num_args); + } else if (!arity.is_varargs && passed_num_args != arity.num_args) { + return Status::Invalid("Function accepts ", arity.num_args, " arguments but passed ", + passed_num_args); + } + for 
(const auto& kernel : kernels) { + if (kernel.signature->MatchesInputs(values)) { + return &kernel; + } + } + return Status::KeyError("Function ", func.name(), + " has no kernel exactly matching input types ", + FormatArgTypes(values)); +} + +Status ScalarFunction::AddKernel(std::vector in_types, OutputType out_type, + ArrayKernelExec exec, KernelInit init) { + RETURN_NOT_OK(CheckArity(in_types, arity_)); + + if (arity_.is_varargs && in_types.size() != 1) { + return Status::Invalid("Varargs signatures must have exactly one input type"); + } + auto sig = + KernelSignature::Make(std::move(in_types), std::move(out_type), arity_.is_varargs); + kernels_.emplace_back(std::move(sig), exec, init); + return Status::OK(); +} + +Status ScalarFunction::AddKernel(ScalarKernel kernel) { + RETURN_NOT_OK(CheckArity(kernel.signature->in_types(), arity_)); + if (arity_.is_varargs && !kernel.signature->is_varargs()) { + return Status::Invalid("Function accepts varargs but kernel signature does not"); + } + kernels_.emplace_back(std::move(kernel)); + return Status::OK(); +} + +Result ScalarFunction::DispatchExact( + const std::vector& values) const { + return DispatchExactImpl(*this, kernels_, values); +} + +Status VectorFunction::AddKernel(std::vector in_types, OutputType out_type, + ArrayKernelExec exec, KernelInit init) { + RETURN_NOT_OK(CheckArity(in_types, arity_)); + + if (arity_.is_varargs && in_types.size() != 1) { + return Status::Invalid("Varargs signatures must have exactly one input type"); + } + auto sig = + KernelSignature::Make(std::move(in_types), std::move(out_type), arity_.is_varargs); + kernels_.emplace_back(std::move(sig), exec, init); + return Status::OK(); +} + +Status VectorFunction::AddKernel(VectorKernel kernel) { + RETURN_NOT_OK(CheckArity(kernel.signature->in_types(), arity_)); + if (arity_.is_varargs && !kernel.signature->is_varargs()) { + return Status::Invalid("Function accepts varargs but kernel signature does not"); + } + 
kernels_.emplace_back(std::move(kernel)); + return Status::OK(); +} + +Result VectorFunction::DispatchExact( + const std::vector& values) const { + return DispatchExactImpl(*this, kernels_, values); +} + +Status ScalarAggregateFunction::AddKernel(ScalarAggregateKernel kernel) { + RETURN_NOT_OK(CheckArity(kernel.signature->in_types(), arity_)); + if (arity_.is_varargs && !kernel.signature->is_varargs()) { + return Status::Invalid("Function accepts varargs but kernel signature does not"); + } + kernels_.emplace_back(std::move(kernel)); + return Status::OK(); +} + +Result ScalarAggregateFunction::DispatchExact( + const std::vector& values) const { + return DispatchExactImpl(*this, kernels_, values); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h new file mode 100644 index 00000000000..3fa9ab1ae24 --- /dev/null +++ b/cpp/src/arrow/compute/function.h @@ -0,0 +1,197 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// NOTE: API is EXPERIMENTAL and will change without going through a +// deprecation cycle + +#pragma once + +#include +#include +#include + +#include "arrow/compute/kernel.h" +#include "arrow/compute/options.h" // IWYU pragma: keep +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +struct ValueDescr; + +namespace compute { + +/// \brief Contains the number of required arguments for the function +struct ARROW_EXPORT FunctionArity { + static FunctionArity Nullary() { return FunctionArity(0, false); } + static FunctionArity Unary() { return FunctionArity(1, false); } + static FunctionArity Binary() { return FunctionArity(2, false); } + static FunctionArity Ternary() { return FunctionArity(3, false); } + static FunctionArity Varargs(int min_args = 1) { return FunctionArity(min_args, true); } + + FunctionArity(int num_args, bool is_varargs = false) // NOLINT implicit conversion + : num_args(num_args), is_varargs(is_varargs) {} + + /// The number of required arguments (or the minimum number for varargs + /// functions) + int num_args; + + /// If true, then the num_args is the minimum number of required arguments + bool is_varargs = false; +}; + +/// \brief Base class for function containers that are capable of dispatch to +/// kernel implementations +class ARROW_EXPORT Function { + public: + /// \brief The kind of function, which indicates in what contexts it is + /// valid for use + enum Kind { + /// A function that performs scalar data operations on whole arrays of + /// data. Can generally process Array or Scalar values. The size of the + /// output will be the same as the size (or broadcasted size, in the case + /// of mixing Array and Scalar inputs) of the input. + SCALAR, + + /// A function with array input and output whose behavior depends on the + /// values of the entire arrays passed, rather than the value of each scalar + /// value. 
+ VECTOR, + + /// A function that computes scalar summary statistics from array input. + SCALAR_AGGREGATE + }; + + virtual ~Function() = default; + + /// \brief The name of the kernel. The registry enforces uniqueness of names + const std::string& name() const { return name_; } + + /// \brief The kind of kernel, which indicates in what contexts it is valid + /// for use + Function::Kind kind() const { return kind_; } + + /// \brief Contains the number of arguments the function requires + const FunctionArity& arity() const { return arity_; } + + /// \brief Returns the number of registered kernels for this function + virtual int num_kernels() const = 0; + + protected: + Function(std::string name, Function::Kind kind, const FunctionArity& arity) + : name_(std::move(name)), kind_(kind), arity_(arity) {} + std::string name_; + Function::Kind kind_; + FunctionArity arity_; +}; + +namespace detail { + +template +class FunctionImpl : public Function { + public: + /// \brief Return vector of all available kernels for this function + const std::vector& kernels() const { return kernels_; } + + int num_kernels() const override { return static_cast(kernels_.size()); } + + protected: + FunctionImpl(std::string name, Function::Kind kind, const FunctionArity& arity) + : Function(std::move(name), kind, arity) {} + + std::vector kernels_; +}; + +} // namespace detail + +/// \brief A function that executes elementwise operations on arrays or +/// scalars, and therefore whose results generally do not depend on the order +/// of the values in the arguments. Accepts and returns arrays that are all of +/// the same size. These functions roughly correspond to the functions used in +/// SQL expressions. 
+class ARROW_EXPORT ScalarFunction : public detail::FunctionImpl { + public: + using KernelType = ScalarKernel; + + ScalarFunction(std::string name, const FunctionArity& arity) + : detail::FunctionImpl(std::move(name), Function::SCALAR, arity) {} + + /// \brief Add a simple kernel (function implementation) with given + /// input/output types, no required state initialization, preallocation for + /// fixed-width types, and default null handling (intersect validity bitmaps + /// of inputs) + Status AddKernel(std::vector in_types, OutputType out_type, + ArrayKernelExec func, KernelInit init = NULLPTR); + + /// \brief Add a kernel (function implementation). Returns error if fails + /// to match the other parameters of the function + Status AddKernel(ScalarKernel kernel); + + /// \brief Return the first kernel that can execute the function given the + /// exact argument types (without implicit type casts or scalar->array + /// promotions) + Result DispatchExact(const std::vector& values) const; +}; + +/// \brief A function that executes general array operations that may yield +/// outputs of different sizes or have results that depend on the whole array +/// contents. These functions roughly correspond to the functions found in +/// non-SQL array languages like APL and its derivatives +class ARROW_EXPORT VectorFunction : public detail::FunctionImpl { + public: + using KernelType = VectorKernel; + + VectorFunction(std::string name, const FunctionArity& arity) + : detail::FunctionImpl(std::move(name), Function::VECTOR, arity) {} + + /// \brief Add a simple kernel (function implementation) with given + /// input/output types, no required state initialization, preallocation for + /// fixed-width types, and default null handling (intersect validity bitmaps + /// of inputs) + Status AddKernel(std::vector in_types, OutputType out_type, + ArrayKernelExec func, KernelInit init = NULLPTR); + + /// \brief Add a kernel (function implementation). 
Returns error if fails + /// to match the other parameters of the function + Status AddKernel(VectorKernel kernel); + + /// \brief Return the first kernel that can execute the function given the + /// exact argument types (without implicit type casts or scalar->array + /// promotions) + Result DispatchExact(const std::vector& values) const; +}; + +class ARROW_EXPORT ScalarAggregateFunction + : public detail::FunctionImpl { + public: + using KernelType = ScalarAggregateKernel; + + ScalarAggregateFunction(std::string name, const FunctionArity& arity) + : detail::FunctionImpl(std::move(name), + Function::SCALAR_AGGREGATE, arity) {} + + /// \brief Add a kernel (function implementation). Returns error if fails + /// to match the other parameters of the function + Status AddKernel(ScalarAggregateKernel kernel); + + Result DispatchExact( + const std::vector& values) const; +}; + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc new file mode 100644 index 00000000000..89c3ed00352 --- /dev/null +++ b/cpp/src/arrow/compute/function_test.cc @@ -0,0 +1,239 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include + +#include "arrow/compute/function.h" +#include "arrow/compute/kernel.h" +#include "arrow/status.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" + +namespace arrow { + +struct Datum; + +namespace compute { + +class ExecContext; +struct FunctionOptions; + +TEST(FunctionArity, Basics) { + auto nullary = FunctionArity::Nullary(); + ASSERT_EQ(0, nullary.num_args); + ASSERT_FALSE(nullary.is_varargs); + + auto unary = FunctionArity::Unary(); + ASSERT_EQ(1, unary.num_args); + + auto binary = FunctionArity::Binary(); + ASSERT_EQ(2, binary.num_args); + + auto ternary = FunctionArity::Ternary(); + ASSERT_EQ(3, ternary.num_args); + + auto varargs = FunctionArity::Varargs(); + ASSERT_EQ(1, varargs.num_args); + ASSERT_TRUE(varargs.is_varargs); + + auto varargs2 = FunctionArity::Varargs(2); + ASSERT_EQ(2, varargs2.num_args); + ASSERT_TRUE(varargs2.is_varargs); +} + +TEST(ScalarFunction, Basics) { + ScalarFunction func("scalar_test", 2); + ScalarFunction varargs_func("varargs_test", FunctionArity::Varargs(1)); + + ASSERT_EQ("scalar_test", func.name()); + ASSERT_EQ(2, func.arity().num_args); + ASSERT_FALSE(func.arity().is_varargs); + ASSERT_EQ(Function::SCALAR, func.kind()); + + ASSERT_EQ("varargs_test", varargs_func.name()); + ASSERT_EQ(1, varargs_func.arity().num_args); + ASSERT_TRUE(varargs_func.arity().is_varargs); + ASSERT_EQ(Function::SCALAR, varargs_func.kind()); +} + +TEST(VectorFunction, Basics) { + VectorFunction func("vector_test", 2); + VectorFunction varargs_func("varargs_test", FunctionArity::Varargs(1)); + + ASSERT_EQ("vector_test", func.name()); + ASSERT_EQ(2, func.arity().num_args); + ASSERT_FALSE(func.arity().is_varargs); + ASSERT_EQ(Function::VECTOR, func.kind()); + + ASSERT_EQ("varargs_test", varargs_func.name()); + ASSERT_EQ(1, varargs_func.arity().num_args); + ASSERT_TRUE(varargs_func.arity().is_varargs); + ASSERT_EQ(Function::VECTOR, varargs_func.kind()); +} + +auto ExecNYI = [](KernelContext* 
ctx, const ExecBatch& args, Datum* out) { + ctx->SetStatus(Status::NotImplemented("NYI")); + return; +}; + +template +void CheckAddDispatch(FunctionType* func) { + using KernelType = typename FunctionType::KernelType; + + ASSERT_EQ(0, func->num_kernels()); + ASSERT_EQ(0, func->kernels().size()); + + std::vector in_types1 = {int32(), int32()}; + OutputType out_type1 = int32(); + + ASSERT_OK(func->AddKernel(in_types1, out_type1, ExecNYI)); + ASSERT_OK(func->AddKernel({int32(), int8()}, int32(), ExecNYI)); + + // Duplicate sig is okay + ASSERT_OK(func->AddKernel(in_types1, out_type1, ExecNYI)); + + // Add given a descr + KernelType descr({float64(), float64()}, float64(), ExecNYI); + ASSERT_OK(func->AddKernel(descr)); + + ASSERT_EQ(4, func->num_kernels()); + ASSERT_EQ(4, func->kernels().size()); + + // Try adding some invalid kernels + ASSERT_RAISES(Invalid, func->AddKernel({}, int32(), ExecNYI)); + ASSERT_RAISES(Invalid, func->AddKernel({int32()}, int32(), ExecNYI)); + ASSERT_RAISES(Invalid, func->AddKernel({int8(), int8(), int8()}, int32(), ExecNYI)); + + // Add valid and invalid kernel using kernel struct directly + KernelType valid_kernel({boolean(), boolean()}, boolean(), ExecNYI); + ASSERT_OK(func->AddKernel(valid_kernel)); + + KernelType invalid_kernel({boolean()}, boolean(), ExecNYI); + ASSERT_RAISES(Invalid, func->AddKernel(invalid_kernel)); + + ASSERT_OK_AND_ASSIGN(const KernelType* kernel, func->DispatchExact({int32(), int32()})); + KernelSignature expected_sig(in_types1, out_type1); + ASSERT_TRUE(kernel->signature->Equals(expected_sig)); + + // No kernel available + ASSERT_RAISES(KeyError, func->DispatchExact({utf8(), utf8()})); + + // Wrong arity + ASSERT_RAISES(Invalid, func->DispatchExact({})); + ASSERT_RAISES(Invalid, func->DispatchExact({int32(), int32(), int32()})); +} + +TEST(ScalarVectorFunction, DispatchExact) { + ScalarFunction func1("scalar_test", 2); + VectorFunction func2("vector_test", 2); + + CheckAddDispatch(&func1); + 
CheckAddDispatch(&func2); +} + +TEST(ArrayFunction, Varargs) { + ScalarFunction va_func("va_test", FunctionArity::Varargs(1)); + + std::vector va_args = {int8()}; + + ASSERT_OK(va_func.AddKernel(va_args, int8(), ExecNYI)); + + // No input type passed + ASSERT_RAISES(Invalid, va_func.AddKernel({}, int8(), ExecNYI)); + + // Varargs function expect a single input type + ASSERT_RAISES(Invalid, va_func.AddKernel({int8(), int8()}, int8(), ExecNYI)); + + // Invalid sig + ScalarKernel non_va_kernel(std::make_shared(va_args, int8()), ExecNYI); + ASSERT_RAISES(Invalid, va_func.AddKernel(non_va_kernel)); + + std::vector args = {ValueDescr::Scalar(int8()), int8(), int8()}; + ASSERT_OK_AND_ASSIGN(const ScalarKernel* kernel, va_func.DispatchExact(args)); + ASSERT_TRUE(kernel->signature->MatchesInputs(args)); + + // No dispatch possible because args incompatible + args[2] = int32(); + ASSERT_RAISES(KeyError, va_func.DispatchExact(args)); +} + +TEST(ScalarAggregateFunction, Basics) { + ScalarAggregateFunction func("agg_test", 1); + + ASSERT_EQ("agg_test", func.name()); + ASSERT_EQ(1, func.arity().num_args); + ASSERT_FALSE(func.arity().is_varargs); + ASSERT_EQ(Function::SCALAR_AGGREGATE, func.kind()); +} + +std::unique_ptr NoopInit(KernelContext*, const Kernel&, + const FunctionOptions*) { + return nullptr; +} + +void NoopConsume(KernelContext*, const ExecBatch&) {} +void NoopMerge(KernelContext*, const KernelState&, KernelState*) {} +void NoopFinalize(KernelContext*, Datum*) {} + +TEST(ScalarAggregateFunction, DispatchExact) { + ScalarAggregateFunction func("agg_test", 1); + + std::vector in_args = {ValueDescr::Array(int8())}; + ScalarAggregateKernel kernel(std::move(in_args), int64(), NoopInit, NoopConsume, + NoopMerge, NoopFinalize); + ASSERT_OK(func.AddKernel(kernel)); + + in_args = {float64()}; + kernel.signature = std::make_shared(in_args, float64()); + ASSERT_OK(func.AddKernel(kernel)); + + ASSERT_EQ(2, func.num_kernels()); + ASSERT_EQ(2, func.kernels().size()); + 
ASSERT_TRUE(func.kernels()[1].signature->Equals(*kernel.signature)); + + // Invalid arity + in_args = {}; + kernel.signature = std::make_shared(in_args, float64()); + ASSERT_RAISES(Invalid, func.AddKernel(kernel)); + + in_args = {float32(), float64()}; + kernel.signature = std::make_shared(in_args, float64()); + ASSERT_RAISES(Invalid, func.AddKernel(kernel)); + + std::vector dispatch_args = {ValueDescr::Array(int8())}; + ASSERT_OK_AND_ASSIGN(const ScalarAggregateKernel* selected_kernel, + func.DispatchExact(dispatch_args)); + ASSERT_EQ(&func.kernels()[0], selected_kernel); + ASSERT_TRUE(selected_kernel->signature->MatchesInputs(dispatch_args)); + + // We declared that only arrays are accepted + dispatch_args[0] = {ValueDescr::Scalar(int8())}; + ASSERT_RAISES(KeyError, func.DispatchExact(dispatch_args)); + + // Didn't qualify the float64() kernel so this actually dispatches (even + // though that may not be what you want) + dispatch_args[0] = {ValueDescr::Scalar(float64())}; + ASSERT_OK_AND_ASSIGN(selected_kernel, func.DispatchExact(dispatch_args)); + ASSERT_TRUE(selected_kernel->signature->MatchesInputs(dispatch_args)); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernel.cc b/cpp/src/arrow/compute/kernel.cc new file mode 100644 index 00000000000..b03523c8be0 --- /dev/null +++ b/cpp/src/arrow/compute/kernel.cc @@ -0,0 +1,296 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/kernel.h" + +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/compute/exec.h" +#include "arrow/result.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/hashing.h" +#include "arrow/util/logging.h" +#include "arrow/util/macros.h" + +namespace arrow { + +using internal::hash_combine; + +static constexpr size_t kHashSeed = 0; + +namespace compute { + +// ---------------------------------------------------------------------- +// KernelContext + +inline void ZeroLastByte(Buffer* buffer) { + *(buffer->mutable_data() + (buffer->size() - 1)) = 0; +} + +Result> KernelContext::Allocate(int64_t nbytes) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr result, + AllocateBuffer(nbytes, exec_ctx_->memory_pool())); + result->ZeroPadding(); + return result; +} + +Result> KernelContext::AllocateBitmap(int64_t num_bits) { + const int64_t nbytes = BitUtil::BytesForBits(num_bits); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr result, + AllocateBuffer(nbytes, exec_ctx_->memory_pool())); + // Some utility methods access the last byte before it might be + // initialized this makes valgrind/asan unhappy, so we proactively + // zero it. 
+ ZeroLastByte(result.get()); + result->ZeroPadding(); + return result; +} + +void KernelContext::SetStatus(const Status& status) { + if (ARROW_PREDICT_FALSE(!status_.ok())) { + return; + } + status_ = status; +} + +/// \brief Clear any error status +void KernelContext::ResetStatus() { status_ = Status::OK(); } + +// ---------------------------------------------------------------------- +// InputType + +size_t InputType::Hash() const { + size_t result = kHashSeed; + hash_combine(result, static_cast(shape_)); + switch (kind_) { + case InputType::EXACT_TYPE: + hash_combine(result, type_->Hash()); + break; + case InputType::SAME_TYPE_ID: + hash_combine(result, static_cast(type_id_)); + break; + default: + break; + } + return result; +} + +std::string InputType::ToString() const { + std::stringstream ss; + switch (shape_) { + case ValueDescr::ANY: + ss << "any"; + break; + case ValueDescr::ARRAY: + ss << "array"; + break; + case ValueDescr::SCALAR: + ss << "scalar"; + break; + default: + DCHECK(false); + break; + } + ss << "["; + switch (kind_) { + case InputType::EXACT_TYPE: + ss << type_->ToString(); + break; + case InputType::SAME_TYPE_ID: { + // Indicate that the parameters for the type are unspecified. 
TODO: don't + // show this for types without parameters, like Type::INT32 + ss << internal::ToString(type_id_) << "*"; + } break; + default: + DCHECK(false); + break; + } + ss << "]"; + return ss.str(); +} + +bool InputType::Equals(const InputType& other) const { + if (this == &other) { + return true; + } + if (kind_ != other.kind_ || shape_ != other.shape_) { + return false; + } + switch (kind_) { + case InputType::EXACT_TYPE: + return type_->Equals(*other.type_); + case InputType::SAME_TYPE_ID: + return type_id_ == other.type_id_; + default: + return false; + } +} + +bool InputType::Matches(const ValueDescr& descr) const { + if (shape_ != ValueDescr::ANY && descr.shape != shape_) { + return false; + } + switch (kind_) { + case InputType::EXACT_TYPE: + return type_->Equals(*descr.type); + case InputType::SAME_TYPE_ID: + return type_id_ == descr.type->id(); + default: + // ANY_TYPE + return true; + } +} + +bool InputType::Matches(const Datum& value) const { return Matches(value.descr()); } + +const std::shared_ptr& InputType::type() const { + DCHECK_EQ(InputType::EXACT_TYPE, kind_); + return type_; +} + +Type::type InputType::type_id() const { + DCHECK_EQ(InputType::SAME_TYPE_ID, kind_); + return type_id_; +} + +// ---------------------------------------------------------------------- +// OutputType + +OutputType::Resolver ResolveAs(ValueDescr descr) { + return [descr](const std::vector&) { return descr; }; +} + +OutputType::OutputType(ValueDescr descr) : resolver_(ResolveAs(descr)) {} + +Result OutputType::Resolve(const std::vector& args) const { + if (kind_ == OutputType::FIXED) { + return ValueDescr(type_, GetBroadcastShape(args)); + } else { + return resolver_(args); + } +} + +const std::shared_ptr& OutputType::type() const { + DCHECK_EQ(FIXED, kind_); + return type_; +} + +const OutputType::Resolver& OutputType::resolver() const { + DCHECK_EQ(COMPUTED, kind_); + return resolver_; +} + +std::string OutputType::ToString() const { + if (kind_ == 
OutputType::FIXED) { + return type_->ToString(); + } else { + return "computed"; + } +} + +// ---------------------------------------------------------------------- +// KernelSignature + +KernelSignature::KernelSignature(std::vector in_types, OutputType out_type, + bool is_varargs) + : in_types_(std::move(in_types)), + out_type_(std::move(out_type)), + is_varargs_(is_varargs), + hash_code_(0) { + // Varargs sigs must have only a single input type to use for argument validation + DCHECK(!is_varargs || (is_varargs && (in_types_.size() == 1))); +} + +std::shared_ptr KernelSignature::Make(std::vector in_types, + OutputType out_type, + bool is_varargs) { + return std::make_shared(std::move(in_types), std::move(out_type), + is_varargs); +} + +bool KernelSignature::Equals(const KernelSignature& other) const { + if (is_varargs_ != other.is_varargs_) { + return false; + } + if (in_types_.size() != other.in_types_.size()) { + return false; + } + for (size_t i = 0; i < in_types_.size(); ++i) { + if (!in_types_[i].Equals(other.in_types_[i])) { + return false; + } + } + return true; +} + +bool KernelSignature::MatchesInputs(const std::vector& args) const { + if (is_varargs_) { + for (const auto& arg : args) { + if (!in_types_[0].Matches(arg)) { + return false; + } + } + } else { + if (args.size() != in_types_.size()) { + return false; + } + for (size_t i = 0; i < in_types_.size(); ++i) { + if (!in_types_[i].Matches(args[i])) { + return false; + } + } + } + return true; +} + +int64_t KernelSignature::Hash() const { + if (hash_code_ != 0) { + return hash_code_; + } + size_t result = kHashSeed; + for (const auto& in_type : in_types_) { + hash_combine(result, in_type.Hash()); + } + hash_code_ = result; + return result; +} + +std::string KernelSignature::ToString() const { + std::stringstream ss; + + if (is_varargs_) { + ss << "varargs[" << in_types_[0].ToString() << "]"; + } else { + ss << "("; + for (size_t i = 0; i < in_types_.size(); ++i) { + if (i > 0) { + ss << ", "; + } + ss 
<< in_types_[i].ToString(); + } + ss << ")"; + } + ss << " -> " << out_type_.ToString(); + return ss.str(); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index 16dca696567..30eb097f5ef 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -15,295 +15,472 @@ // specific language governing permissions and limitations // under the License. +// NOTE: API is EXPERIMENTAL and will change without going through a +// deprecation cycle + #pragma once +#include +#include #include +#include #include #include -#include "arrow/array.h" -#include "arrow/record_batch.h" -#include "arrow/scalar.h" -#include "arrow/table.h" -#include "arrow/util/macros.h" -#include "arrow/util/memory.h" -#include "arrow/util/variant.h" // IWYU pragma: export +#include "arrow/compute/exec.h" +#include "arrow/datum.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type.h" #include "arrow/util/visibility.h" namespace arrow { + +class Buffer; +struct Datum; + namespace compute { -class FunctionContext; +struct FunctionOptions; -/// \class OpKernel -/// \brief Base class for operator kernels -/// -/// Note to implementors: -/// Operator kernels are intended to be the lowest level of an analytics/compute -/// engine. They will generally not be exposed directly to end-users. Instead -/// they will be wrapped by higher level constructs (e.g. top-level functions -/// or physical execution plan nodes). These higher level constructs are -/// responsible for user input validation and returning the appropriate -/// error Status. -/// -/// Due to this design, implementations of Call (the execution -/// method on subclasses) should use assertions (i.e. DCHECK) to double-check -/// parameter arguments when in higher level components returning an -/// InvalidArgument error might be more appropriate. 
-/// -class ARROW_EXPORT OpKernel { +/// \brief Base class for opaque kernel-specific state. For example, if there +/// is some kind of initialization required +struct KernelState { + virtual ~KernelState() = default; +}; + +/// \brief Context/state for the execution of a particular kernel +class ARROW_EXPORT KernelContext { public: - virtual ~OpKernel() = default; - /// \brief EXPERIMENTAL The output data type of the kernel - /// \return the output type - virtual std::shared_ptr out_type() const = 0; + explicit KernelContext(ExecContext* exec_ctx) : exec_ctx_(exec_ctx) {} + + /// \brief Allocate buffer from the context's memory pool + Result> Allocate(int64_t nbytes); + + /// \brief Allocate buffer for bitmap from the context's memory pool + Result> AllocateBitmap(int64_t num_bits); + + /// \brief Indicate that an error has occurred, to be checked by a exec caller + /// \param[in] status a Status instance + /// + /// \note Will not overwrite a prior set Status, so we will have the first + /// error that occurred until ExecContext::ResetStatus is called + void SetStatus(const Status& status); + + /// \brief Clear any error status + void ResetStatus(); + + /// \brief Return true if an error has occurred + bool HasError() const { return !status_.ok(); } + + /// \brief Return the current status of the context + const Status& status() const { return status_; } + + // For passing kernel state to + void SetState(KernelState* state) { state_ = state; } + + KernelState* state() { return state_; } + + /// \brief Common state related to function execution + ExecContext* exec_context() { return exec_ctx_; } + + private: + ExecContext* exec_ctx_; + Status status_; + KernelState* state_; }; -struct Datum; -static inline bool CollectionEquals(const std::vector& left, - const std::vector& right); - -// Datums variants may have a length. This special value indicate that the -// current variant does not have a length. 
-constexpr int64_t kUnknownLength = -1; - -/// \class Datum -/// \brief Variant type for various Arrow C++ data structures -struct ARROW_EXPORT Datum { - enum type { NONE, SCALAR, ARRAY, CHUNKED_ARRAY, RECORD_BATCH, TABLE, COLLECTION }; - - util::variant, std::shared_ptr, - std::shared_ptr, std::shared_ptr, - std::shared_ptr
, std::vector> - value; - - /// \brief Empty datum, to be populated elsewhere - Datum() : value(NULLPTR) {} - - Datum(const std::shared_ptr& value) // NOLINT implicit conversion - : value(value) {} - Datum(const std::shared_ptr& value) // NOLINT implicit conversion - : value(value) {} - - Datum(const std::shared_ptr& value) // NOLINT implicit conversion - : Datum(value ? value->data() : NULLPTR) {} - - Datum(const std::shared_ptr& value) // NOLINT implicit conversion - : value(value) {} - Datum(const std::shared_ptr& value) // NOLINT implicit conversion - : value(value) {} - Datum(const std::shared_ptr
& value) // NOLINT implicit conversion - : value(value) {} - Datum(const std::vector& value) // NOLINT implicit conversion - : value(value) {} - - // Cast from subtypes of Array to Datum - template ::value>> - Datum(const std::shared_ptr& value) // NOLINT implicit conversion - : Datum(std::shared_ptr(value)) {} - - // Convenience constructors - explicit Datum(bool value) : value(std::make_shared(value)) {} - explicit Datum(int8_t value) : value(std::make_shared(value)) {} - explicit Datum(uint8_t value) : value(std::make_shared(value)) {} - explicit Datum(int16_t value) : value(std::make_shared(value)) {} - explicit Datum(uint16_t value) : value(std::make_shared(value)) {} - explicit Datum(int32_t value) : value(std::make_shared(value)) {} - explicit Datum(uint32_t value) : value(std::make_shared(value)) {} - explicit Datum(int64_t value) : value(std::make_shared(value)) {} - explicit Datum(uint64_t value) : value(std::make_shared(value)) {} - explicit Datum(float value) : value(std::make_shared(value)) {} - explicit Datum(double value) : value(std::make_shared(value)) {} - - ~Datum() {} - - Datum(const Datum& other) noexcept { this->value = other.value; } - - Datum& operator=(const Datum& other) noexcept { - value = other.value; - return *this; - } +/// A standard function taking zero or more Array/Scalar values and returning +/// Array/Scalar output. May be used for SCALAR and VECTOR kernel kinds. Should +/// write into pre-allocated memory except in cases when a builder +/// (e.g. 
StringBuilder) must be employed +using ArrayKernelExec = std::function; - // Define move constructor and move assignment, for better performance - Datum(Datum&& other) noexcept : value(std::move(other.value)) {} +/// \brief A container to express what kernel argument input types are accepted +class ARROW_EXPORT InputType { + public: + enum Kind { + /// Accept any value type + ANY_TYPE, - Datum& operator=(Datum&& other) noexcept { - value = std::move(other.value); - return *this; - } + /// A fixed arrow::DataType and will only exact match having this exact + /// type (e.g. same TimestampType unit, same decimal scale and precision, + /// or same nested child types + EXACT_TYPE, - Datum::type kind() const { - switch (this->value.index()) { - case 0: - return Datum::NONE; - case 1: - return Datum::SCALAR; - case 2: - return Datum::ARRAY; - case 3: - return Datum::CHUNKED_ARRAY; - case 4: - return Datum::RECORD_BATCH; - case 5: - return Datum::TABLE; - case 6: - return Datum::COLLECTION; - default: - return Datum::NONE; - } - } + /// Any type having the indicated Type::type id. 
For example, accept + /// any Type::LIST or any Type::TIMESTAMP + SAME_TYPE_ID, + }; - std::shared_ptr array() const { - return util::get>(this->value); - } + InputType(ValueDescr::Shape shape = ValueDescr::ANY) // NOLINT implicit construction + : kind_(ANY_TYPE), shape_(shape) {} - std::shared_ptr make_array() const { - return MakeArray(util::get>(this->value)); - } + InputType(std::shared_ptr type, + ValueDescr::Shape shape = ValueDescr::ANY) // NOLINT implicit construction + : kind_(EXACT_TYPE), shape_(shape), type_(std::move(type)) {} - std::shared_ptr chunked_array() const { - return util::get>(this->value); - } + InputType(const ValueDescr& descr) // NOLINT implicit construction + : InputType(descr.type, descr.shape) {} - std::shared_ptr record_batch() const { - return util::get>(this->value); - } + InputType(Type::type type_id, + ValueDescr::Shape shape = ValueDescr::ANY) // NOLINT implicit construction + : kind_(SAME_TYPE_ID), shape_(shape), type_id_(type_id) {} - std::shared_ptr
table() const { - return util::get>(this->value); - } + InputType(const InputType& other) { CopyInto(other); } - const std::vector collection() const { - return util::get>(this->value); + // Convenience ctors + static InputType Array(std::shared_ptr type) { + return InputType(std::move(type), ValueDescr::ARRAY); } - std::shared_ptr scalar() const { - return util::get>(this->value); + static InputType Scalar(std::shared_ptr type) { + return InputType(std::move(type), ValueDescr::SCALAR); } - bool is_array() const { return this->kind() == Datum::ARRAY; } + static InputType Array(Type::type id) { return InputType(id, ValueDescr::ARRAY); } - bool is_arraylike() const { - return this->kind() == Datum::ARRAY || this->kind() == Datum::CHUNKED_ARRAY; - } + static InputType Scalar(Type::type id) { return InputType(id, ValueDescr::SCALAR); } - bool is_scalar() const { return this->kind() == Datum::SCALAR; } + void operator=(const InputType& other) { CopyInto(other); } - bool is_collection() const { return this->kind() == Datum::COLLECTION; } + InputType(InputType&& other) { MoveInto(std::forward(other)); } - /// \brief The value type of the variant, if any - /// - /// \return nullptr if no type - std::shared_ptr type() const { - if (this->kind() == Datum::ARRAY) { - return util::get>(this->value)->type; - } else if (this->kind() == Datum::CHUNKED_ARRAY) { - return util::get>(this->value)->type(); - } else if (this->kind() == Datum::SCALAR) { - return util::get>(this->value)->type; - } - return NULLPTR; + void operator=(InputType&& other) { MoveInto(std::forward(other)); } + + /// \brief Return true if this type exactly matches another + bool Equals(const InputType& other) const; + + bool operator==(const InputType& other) const { return this->Equals(other); } + + bool operator!=(const InputType& other) const { return !(*this == other); } + + /// \brief Return hash code + uint64_t Hash() const; + + /// \brief Render a human-readable string representation + std::string 
ToString() const; + + /// \brief Return true if the value matches this argument kind in type + /// and shape + bool Matches(const Datum& value) const; + + /// \brief Return true if the value descriptor matches this argument kind in + /// type and shape + bool Matches(const ValueDescr& value) const; + + /// \brief The type matching rule that this InputType uses + Kind kind() const { return kind_; } + + ValueDescr::Shape shape() const { return shape_; } + + /// \brief For ArgKind::EXACT_TYPE, the exact type that this InputType must + /// match. Otherwise this function should not be used + const std::shared_ptr& type() const; + + /// \brief For ArgKind::SAME_TYPE_ID, the Type::type that this InputType must + /// match, Otherwise this function should not be used + Type::type type_id() const; + + private: + void CopyInto(const InputType& other) { + this->kind_ = other.kind_; + this->shape_ = other.shape_; + this->type_ = other.type_; + this->type_id_ = other.type_id_; } - /// \brief The value length of the variant, if any - /// - /// \return kUnknownLength if no type - int64_t length() const { - if (this->kind() == Datum::ARRAY) { - return util::get>(this->value)->length; - } else if (this->kind() == Datum::CHUNKED_ARRAY) { - return util::get>(this->value)->length(); - } else if (this->kind() == Datum::SCALAR) { - return 1; - } - return kUnknownLength; + void MoveInto(InputType&& other) { + this->kind_ = other.kind_; + this->shape_ = other.shape_; + this->type_ = std::move(other.type_); + this->type_id_ = other.type_id_; } - /// \brief The array chunks of the variant, if any - /// - /// \return empty if not arraylike - ArrayVector chunks() const { - if (!this->is_arraylike()) { - return {}; - } - if (this->is_array()) { - return {this->make_array()}; - } - return this->chunked_array()->chunks(); + Kind kind_; + + ValueDescr::Shape shape_; + + // For EXACT_TYPE ArgKind + std::shared_ptr type_; + + // For SAME_TYPE_ID ArgKind + Type::type type_id_; +}; + +/// \brief 
Container to capture both exact and input-dependent output types +/// +/// The value shape returned by Resolve will be determined by broadcasting the +/// shapes of the input arguments, otherwise this is handled by the +/// user-defined resolver function +/// +/// * Any ARRAY shape -> output shape is ARRAY +/// * All SCALAR shapes -> output shape is SCALAR +class ARROW_EXPORT OutputType { + public: + /// \brief An enum indicating whether the value type is an invariant fixed + /// value or one that's computed by a kernel-defined resolver function + enum ResolveKind { FIXED, COMPUTED }; + + /// Type resolution function. Given input types and shapes, return output + /// type and shape. This function SHOULD _not_ be used to check for arity, + /// that SHOULD be performed one or more layers above. + using Resolver = std::function(const std::vector&)>; + + OutputType(std::shared_ptr type) // NOLINT implicit construction + : kind_(FIXED), type_(std::move(type)) {} + + /// For outputting a particular type and shape + OutputType(ValueDescr descr); // NOLINT implicit construction + + explicit OutputType(Resolver resolver) : kind_(COMPUTED), resolver_(resolver) {} + + OutputType(const OutputType& other) { + this->kind_ = other.kind_; + this->type_ = other.type_; + this->resolver_ = other.resolver_; } - bool Equals(const Datum& other) const { - if (this->kind() != other.kind()) return false; - - switch (this->kind()) { - case Datum::NONE: - return true; - case Datum::SCALAR: - return internal::SharedPtrEquals(this->scalar(), other.scalar()); - case Datum::ARRAY: - return internal::SharedPtrEquals(this->make_array(), other.make_array()); - case Datum::CHUNKED_ARRAY: - return internal::SharedPtrEquals(this->chunked_array(), other.chunked_array()); - case Datum::RECORD_BATCH: - return internal::SharedPtrEquals(this->record_batch(), other.record_batch()); - case Datum::TABLE: - return internal::SharedPtrEquals(this->table(), other.table()); - case Datum::COLLECTION: - return 
CollectionEquals(this->collection(), other.collection()); - default: - return false; - } + OutputType(OutputType&& other) { + this->kind_ = other.kind_; + this->type_ = std::move(other.type_); + this->resolver_ = other.resolver_; } + + /// \brief Return the shape and type of the expected output value of the + /// kernel given the value descriptors (shapes and types) + Result Resolve(const std::vector& args) const; + + /// \brief The value type for the FIXED kind rule + const std::shared_ptr& type() const; + + /// \brief For use with COMPUTED resolution strategy, the output type depends + /// on the input type. It may be more convenient to invoke this with + /// OutputType::Resolve returned from this method + const Resolver& resolver() const; + + /// \brief Render a human-readable string representation + std::string ToString() const; + + /// \brief Return the kind of type resolution of this output type, whether + /// fixed/invariant or computed by a "user"-defined resolver + ResolveKind kind() const { return kind_; } + + private: + ResolveKind kind_; + + // For FIXED resolution + std::shared_ptr type_; + + // For COMPUTED resolution + Resolver resolver_; }; -/// \class UnaryKernel -/// \brief An array-valued function of a single input argument. +/// \brief Holds the input types and output type of the kernel /// -/// Note to implementors: Try to avoid making kernels that allocate memory if -/// the output size is a deterministic function of the Input Datum's metadata. -/// Instead separate the logic of the kernel and allocations necessary into -/// two different kernels. Some reusable kernels that allocate buffers -/// and delegate computation to another kernel are available in util-internal.h. -class ARROW_EXPORT UnaryKernel : public OpKernel { +/// Varargs functions should pass a single input type to be used to validate +/// the the input types of a function invocation +class ARROW_EXPORT KernelSignature { public: - /// \brief Executes the kernel. 
- /// - /// \param[in] ctx The function context for the kernel - /// \param[in] input The kernel input data - /// \param[out] out The output of the function. Each implementation of this - /// function might assume different things about the existing contents of out - /// (e.g. which buffers are preallocated). In the future it is expected that - /// there will be a more generic mechanism for understanding the necessary - /// contracts. - virtual Status Call(FunctionContext* ctx, const Datum& input, Datum* out) = 0; + KernelSignature(std::vector in_types, OutputType out_type, + bool is_varargs = false); + + /// \brief Convenience ctor since make_shared can be awkward + static std::shared_ptr Make(std::vector in_types, + OutputType out_type, + bool is_varargs = false); + + /// \brief Return true if the signature if compatible with the list of input + /// value descriptors + bool MatchesInputs(const std::vector& descriptors) const; + + /// \brief Returns true if the input types of each signature are + /// equal. 
Well-formed functions should have a deterministic output type + /// given input types, but currently it is the responsibility of the + /// developer to ensure this + bool Equals(const KernelSignature& other) const; + + bool operator==(const KernelSignature& other) const { return this->Equals(other); } + + bool operator!=(const KernelSignature& other) const { return !(*this == other); } + + /// \brief Compute a hash code for the signature + int64_t Hash() const; + + const std::vector& in_types() const { return in_types_; } + + const OutputType& out_type() const { return out_type_; } + + /// \brief Render a human-readable string representation + std::string ToString() const; + + bool is_varargs() const { return is_varargs_; } + + private: + std::vector in_types_; + OutputType out_type_; + bool is_varargs_; + + // For caching the hash code after it's computed the first time + mutable int64_t hash_code_; }; -/// \class BinaryKernel -/// \brief An array-valued function of a two input arguments -class ARROW_EXPORT BinaryKernel : public OpKernel { - public: - virtual Status Call(FunctionContext* ctx, const Datum& left, const Datum& right, - Datum* out) = 0; +struct SimdLevel { + enum type { NONE, SSE4_2, AVX, AVX2, AVX512, NEON }; }; -// TODO doxygen 1.8.16 does not like the following code -///@cond INTERNAL +struct NullHandling { + enum type { + /// Compute the output validity bitmap by intersecting the validity bitmaps + /// of the arguments. 
Kernel does not do anything with the bitmap + INTERSECTION, -static inline bool CollectionEquals(const std::vector& left, - const std::vector& right) { - if (left.size() != right.size()) { - return false; - } + /// Kernel expects a pre-allocated buffer to write the result bitmap into + COMPUTED_PREALLOCATE, - for (size_t i = 0; i < left.size(); i++) { - if (!left[i].Equals(right[i])) { - return false; - } - } - return true; -} + /// Kernel allocates and populates the validity bitmap of the output + COMPUTED_NO_PREALLOCATE, + + /// Output is never null + OUTPUT_NOT_NULL + }; +}; + +struct MemAllocation { + enum type { + // For data types that support pre-allocation (fixed-type), the kernel + // expects to be provided pre-allocated memory to write + // into. Non-fixed-width must always allocate their own memory but perhaps + // not their validity bitmaps. The allocation made for the same length as + // the execution batch, so vector kernels yielding differently sized output + // should not use this + PREALLOCATE, + + // The kernel does its own memory allocation + NO_PREALLOCATE + }; +}; + +struct Kernel; + +using KernelInit = std::function( + KernelContext*, const Kernel&, const FunctionOptions*)>; + +/// \brief Base type for kernels. Contains the function signature and +/// optionally the state initialization function, along with some common +/// attributes +struct Kernel { + Kernel() {} + + Kernel(std::shared_ptr sig, KernelInit init) + : signature(std::move(sig)), init(init) {} -///@endcond + Kernel(std::vector in_types, OutputType out_type, KernelInit init) + : Kernel(KernelSignature::Make(std::move(in_types), out_type), init) {} + + std::shared_ptr signature; + + /// \brief Create a new KernelState for invocations of this kernel, e.g. to + /// set up any options or state relevant for execution. May be nullptr + KernelInit init; + + // Does execution benefit from parallelization (splitting large chunks into + // smaller chunks and using multiple threads). 
Some vector kernels may + // require single-threaded execution. + bool parallelizable = true; + + SimdLevel::type simd_level = SimdLevel::NONE; +}; + +/// \brief Descriptor to hold signature and execution function implementations +/// for a particular kernel +struct ArrayKernel : public Kernel { + ArrayKernel() {} + + ArrayKernel(std::shared_ptr sig, ArrayKernelExec exec, + KernelInit init = NULLPTR) + : Kernel(std::move(sig), init), exec(exec) {} + + ArrayKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, + KernelInit init = NULLPTR) + : Kernel(std::move(in_types), std::move(out_type), init), exec(exec) {} + + /// \brief Perform a single invocation of this kernel. In general, this + /// function must + ArrayKernelExec exec; + + /// \brief Writing execution results into larger contiguous allocations + /// requires that the kernel be able to write into sliced output + /// ArrayData*. Some kernel implementations may not be able to do this, so + /// setting this to false disables this functionality + bool can_write_into_slices = true; +}; + +struct ScalarKernel : public ArrayKernel { + using ArrayKernel::ArrayKernel; + + // For scalar functions preallocated data and intersecting arg validity + // bitmaps is a reasonable default + NullHandling::type null_handling = NullHandling::INTERSECTION; + MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE; +}; + +// Finalize returns Datum to permit multiple return values +using VectorFinalize = std::function*)>; + +struct VectorKernel : public ArrayKernel { + VectorKernel() {} + + VectorKernel(std::shared_ptr sig, ArrayKernelExec exec) + : ArrayKernel(std::move(sig), exec) {} + + VectorKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, + KernelInit init = NULLPTR, VectorFinalize finalize = NULLPTR) + : ArrayKernel(std::move(in_types), out_type, exec, init), finalize(finalize) {} + + VectorKernel(std::shared_ptr sig, ArrayKernelExec exec, + KernelInit init = NULLPTR, 
VectorFinalize finalize = NULLPTR) + : ArrayKernel(std::move(sig), exec, init), finalize(finalize) {} + + VectorFinalize finalize; + + // Since vector kernels generally are implemented rather differently from + // scalar/elementwise kernels (and they may not even yield arrays of the same + // size), so we make the developer opt-in to any memory preallocation rather + // than having to turn it off. + NullHandling::type null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; + MemAllocation::type mem_allocation = MemAllocation::NO_PREALLOCATE; +}; + +using ScalarAggregateConsume = std::function; + +using ScalarAggregateMerge = + std::function; + +// Finalize returns Datum to permit multiple return values +using ScalarAggregateFinalize = std::function; + +struct ScalarAggregateKernel : public Kernel { + ScalarAggregateKernel() {} + + ScalarAggregateKernel(std::shared_ptr sig, KernelInit init, + ScalarAggregateConsume consume, ScalarAggregateMerge merge, + ScalarAggregateFinalize finalize) + : Kernel(std::move(sig), init), + consume(consume), + merge(merge), + finalize(finalize) {} + + ScalarAggregateKernel(std::vector in_types, OutputType out_type, + KernelInit init, ScalarAggregateConsume consume, + ScalarAggregateMerge merge, ScalarAggregateFinalize finalize) + : ScalarAggregateKernel(KernelSignature::Make(std::move(in_types), out_type), init, + consume, merge, finalize) {} + + ScalarAggregateConsume consume; + ScalarAggregateMerge merge; + ScalarAggregateFinalize finalize; +}; } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernel_test.cc b/cpp/src/arrow/compute/kernel_test.cc new file mode 100644 index 00000000000..b562da95815 --- /dev/null +++ b/cpp/src/arrow/compute/kernel_test.cc @@ -0,0 +1,430 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include + +#include "arrow/compute/kernel.h" +#include "arrow/status.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/key_value_metadata.h" + +namespace arrow { +namespace compute { + +// ---------------------------------------------------------------------- +// InputType + +TEST(InputType, AnyTypeConstructor) { + // Check the ANY_TYPE ctors + InputType ty; + ASSERT_EQ(InputType::ANY_TYPE, ty.kind()); + ASSERT_EQ(ValueDescr::ANY, ty.shape()); + + ty = InputType(ValueDescr::SCALAR); + ASSERT_EQ(ValueDescr::SCALAR, ty.shape()); + + ty = InputType(ValueDescr::ARRAY); + ASSERT_EQ(ValueDescr::ARRAY, ty.shape()); +} + +TEST(InputType, Constructors) { + // Exact type constructor + InputType ty1(int8()); + ASSERT_EQ(InputType::EXACT_TYPE, ty1.kind()); + ASSERT_EQ(ValueDescr::ANY, ty1.shape()); + AssertTypeEqual(*int8(), *ty1.type()); + + InputType ty1_implicit = int8(); + ASSERT_TRUE(ty1.Equals(ty1_implicit)); + + InputType ty1_array(int8(), ValueDescr::ARRAY); + ASSERT_EQ(ValueDescr::ARRAY, ty1_array.shape()); + + InputType ty1_scalar(int8(), ValueDescr::SCALAR); + ASSERT_EQ(ValueDescr::SCALAR, ty1_scalar.shape()); + + // Same type id constructor + InputType ty2 = Type::DECIMAL; + ASSERT_EQ(InputType::SAME_TYPE_ID, ty2.kind()); + + InputType ty2_array(Type::DECIMAL, ValueDescr::ARRAY); + ASSERT_EQ(ValueDescr::ARRAY, 
ty2_array.shape()); + + InputType ty2_scalar(Type::DECIMAL, ValueDescr::SCALAR); + ASSERT_EQ(ValueDescr::SCALAR, ty2_scalar.shape()); + + // Implicit construction in a vector + std::vector types = {int8(), Type::DECIMAL}; + ASSERT_TRUE(types[0].Equals(ty1)); + ASSERT_TRUE(types[1].Equals(ty2)); + + // Copy constructor + InputType ty3 = ty1; + InputType ty4 = ty2; + ASSERT_TRUE(ty3.Equals(ty1)); + ASSERT_TRUE(ty4.Equals(ty2)); + + // Move constructor + InputType ty5 = std::move(ty3); + InputType ty6 = std::move(ty4); + ASSERT_TRUE(ty5.Equals(ty1)); + ASSERT_TRUE(ty6.Equals(ty2)); + + // ToString + ASSERT_EQ("any[int8]", ty1.ToString()); + ASSERT_EQ("array[int8]", ty1_array.ToString()); + ASSERT_EQ("scalar[int8]", ty1_scalar.ToString()); + + ASSERT_EQ("any[decimal*]", ty2.ToString()); + ASSERT_EQ("array[decimal*]", ty2_array.ToString()); + ASSERT_EQ("scalar[decimal*]", ty2_scalar.ToString()); +} + +TEST(InputType, Equals) { + InputType t1 = int8(); + InputType t2 = int8(); + InputType t3(int8(), ValueDescr::ARRAY); + InputType t3_i32(int32(), ValueDescr::ARRAY); + InputType t3_scalar(int8(), ValueDescr::SCALAR); + InputType t4(int8(), ValueDescr::ARRAY); + InputType t4_i32(int32(), ValueDescr::ARRAY); + + InputType t5 = Type::DECIMAL; + InputType t6 = Type::DECIMAL; + InputType t7(Type::DECIMAL, ValueDescr::SCALAR); + InputType t7_i32(Type::INT32, ValueDescr::SCALAR); + InputType t8(Type::DECIMAL, ValueDescr::SCALAR); + InputType t8_i32(Type::INT32, ValueDescr::SCALAR); + + ASSERT_TRUE(t1.Equals(t2)); + ASSERT_EQ(t1, t2); + + // ANY vs SCALAR + ASSERT_NE(t1, t3); + + ASSERT_EQ(t3, t4); + + // both ARRAY, but different type + ASSERT_NE(t3, t3_i32); + + // ARRAY vs SCALAR + ASSERT_NE(t3, t3_scalar); + + ASSERT_EQ(t3_i32, t4_i32); + + ASSERT_FALSE(t1.Equals(t5)); + ASSERT_NE(t1, t5); + + ASSERT_EQ(t5, t5); + ASSERT_EQ(t5, t6); + ASSERT_NE(t5, t7); + ASSERT_EQ(t7, t8); + ASSERT_EQ(t7, t8); + ASSERT_NE(t7, t7_i32); + ASSERT_EQ(t7_i32, t8_i32); + + // NOTE: For the time 
being, we treat int32() and Type::INT32 as being + // different. This could obviously be fixed later to make these equivalent + ASSERT_NE(InputType(int8()), InputType(Type::INT32)); + + // Check that field metadata excluded from equality checks + InputType t9 = list( + field("item", utf8(), /*nullable=*/true, key_value_metadata({"foo"}, {"bar"}))); + InputType t10 = list(field("item", utf8())); + ASSERT_TRUE(t9.Equals(t10)); +} + +TEST(InputType, Hash) { + InputType t0; + InputType t0_scalar(ValueDescr::SCALAR); + InputType t0_array(ValueDescr::ARRAY); + + InputType t1 = int8(); + InputType t2 = Type::DECIMAL; + + // These checks try to determine first of all whether Hash always returns the + // same value, and whether the elements of the type are all incorporated into + // the Hash + ASSERT_EQ(t0.Hash(), t0.Hash()); + ASSERT_NE(t0.Hash(), t0_scalar.Hash()); + ASSERT_NE(t0.Hash(), t0_array.Hash()); + ASSERT_NE(t0_scalar.Hash(), t0_array.Hash()); + + ASSERT_EQ(t1.Hash(), t1.Hash()); + ASSERT_EQ(t2.Hash(), t2.Hash()); + + ASSERT_NE(t0.Hash(), t1.Hash()); + ASSERT_NE(t0.Hash(), t2.Hash()); + ASSERT_NE(t1.Hash(), t2.Hash()); +} + +TEST(InputType, Matches) { + InputType ty1 = int8(); + + ASSERT_TRUE(ty1.Matches(ValueDescr::Scalar(int8()))); + ASSERT_TRUE(ty1.Matches(ValueDescr::Array(int8()))); + ASSERT_TRUE(ty1.Matches(ValueDescr::Any(int8()))); + ASSERT_FALSE(ty1.Matches(ValueDescr::Any(int16()))); + + InputType ty2 = Type::DECIMAL; + ASSERT_TRUE(ty2.Matches(ValueDescr::Scalar(decimal(12, 2)))); + ASSERT_TRUE(ty2.Matches(ValueDescr::Array(decimal(12, 2)))); + ASSERT_FALSE(ty2.Matches(ValueDescr::Any(float64()))); + + InputType ty3(int64(), ValueDescr::SCALAR); + ASSERT_FALSE(ty3.Matches(ValueDescr::Array(int64()))); + ASSERT_TRUE(ty3.Matches(ValueDescr::Scalar(int64()))); + ASSERT_FALSE(ty3.Matches(ValueDescr::Scalar(int32()))); + ASSERT_FALSE(ty3.Matches(ValueDescr::Any(int64()))); +} + +// ---------------------------------------------------------------------- +// 
OutputType + +TEST(OutputType, Constructors) { + OutputType ty1 = int8(); + ASSERT_EQ(OutputType::FIXED, ty1.kind()); + AssertTypeEqual(*int8(), *ty1.type()); + + auto DummyResolver = [](const std::vector& args) { + return ValueDescr(int32(), GetBroadcastShape(args)); + }; + OutputType ty2(DummyResolver); + ASSERT_EQ(OutputType::COMPUTED, ty2.kind()); + + ASSERT_OK_AND_ASSIGN(ValueDescr out_descr2, ty2.Resolve({})); + ASSERT_EQ(ValueDescr::Scalar(int32()), out_descr2); + + // Copy constructor + OutputType ty3 = ty1; + ASSERT_EQ(OutputType::FIXED, ty3.kind()); + AssertTypeEqual(*ty1.type(), *ty3.type()); + + OutputType ty4 = ty2; + ASSERT_EQ(OutputType::COMPUTED, ty4.kind()); + ASSERT_OK_AND_ASSIGN(ValueDescr out_descr4, ty4.Resolve({})); + ASSERT_EQ(ValueDescr::Scalar(int32()), out_descr4); + + // Move constructor + OutputType ty5 = std::move(ty1); + ASSERT_EQ(OutputType::FIXED, ty5.kind()); + AssertTypeEqual(*int8(), *ty5.type()); + + OutputType ty6 = std::move(ty4); + ASSERT_EQ(OutputType::COMPUTED, ty6.kind()); + ASSERT_OK_AND_ASSIGN(ValueDescr out_descr6, ty6.Resolve({})); + ASSERT_EQ(ValueDescr::Scalar(int32()), out_descr6); + + // ToString + + // ty1 was copied to ty3 + ASSERT_EQ("int8", ty3.ToString()); + ASSERT_EQ("computed", ty2.ToString()); +} + +TEST(OutputType, Resolve) { + // Check shape promotion rules for FIXED kind + OutputType ty1(int32()); + + ASSERT_OK_AND_ASSIGN(ValueDescr descr, ty1.Resolve({})); + ASSERT_EQ(ValueDescr::Scalar(int32()), descr); + + ASSERT_OK_AND_ASSIGN(descr, ty1.Resolve({ValueDescr(int8(), ValueDescr::SCALAR)})); + ASSERT_EQ(ValueDescr::Scalar(int32()), descr); + + ASSERT_OK_AND_ASSIGN(descr, ty1.Resolve({ValueDescr(int8(), ValueDescr::SCALAR), + ValueDescr(int8(), ValueDescr::ARRAY)})); + ASSERT_EQ(ValueDescr::Array(int32()), descr); + + OutputType ty2([](const std::vector& args) -> Result { + return ValueDescr(args[0].type, GetBroadcastShape(args)); + }); + + ASSERT_OK_AND_ASSIGN(descr, 
ty2.Resolve({ValueDescr::Array(utf8())})); + ASSERT_EQ(ValueDescr::Array(utf8()), descr); + + // Type resolver that returns an error + OutputType ty3([](const std::vector& args) -> Result { + // NB: checking the value types versus the function arity should be + // validated elsewhere, so this is just for illustration purposes + if (args.size() == 0) { + return Status::Invalid("Need at least one argument"); + } + return ValueDescr(args[0]); + }); + ASSERT_RAISES(Invalid, ty3.Resolve({})); +} + +TEST(OutputType, ResolveDescr) { + ValueDescr d1 = ValueDescr::Scalar(int32()); + ValueDescr d2 = ValueDescr::Array(int32()); + + OutputType ty1(d1); + OutputType ty2(d2); + + { + ASSERT_OK_AND_ASSIGN(ValueDescr descr, ty1.Resolve({})); + ASSERT_EQ(d1, descr); + } + + { + ASSERT_OK_AND_ASSIGN(ValueDescr descr, ty2.Resolve({})); + ASSERT_EQ(d2, descr); + } +} + +// ---------------------------------------------------------------------- +// KernelSignature + +TEST(KernelSignature, Basics) { + // (any[int8], scalar[decimal]) -> utf8 + std::vector in_types({int8(), InputType(Type::DECIMAL, ValueDescr::SCALAR)}); + OutputType out_type(utf8()); + + KernelSignature sig(in_types, out_type); + ASSERT_EQ(2, sig.in_types().size()); + ASSERT_TRUE(sig.in_types()[0].type()->Equals(*int8())); + ASSERT_TRUE(sig.in_types()[0].Matches(ValueDescr::Scalar(int8()))); + ASSERT_TRUE(sig.in_types()[0].Matches(ValueDescr::Array(int8()))); + + ASSERT_TRUE(sig.in_types()[1].Matches(ValueDescr::Scalar(decimal(12, 2)))); + ASSERT_FALSE(sig.in_types()[1].Matches(ValueDescr::Array(decimal(12, 2)))); +} + +TEST(KernelSignature, Equals) { + KernelSignature sig1({}, utf8()); + KernelSignature sig1_copy({}, utf8()); + KernelSignature sig2({int8()}, utf8()); + + // Output type doesn't matter (for now) + KernelSignature sig3({int8()}, int32()); + + KernelSignature sig4({int8(), int16()}, utf8()); + KernelSignature sig4_copy({int8(), int16()}, utf8()); + KernelSignature sig5({int8(), int16(), int32()}, utf8()); + 
+ // Differ in shape + KernelSignature sig6({ValueDescr::Scalar(int8())}, utf8()); + KernelSignature sig7({ValueDescr::Array(int8())}, utf8()); + + ASSERT_EQ(sig1, sig1); + + ASSERT_EQ(sig2, sig3); + ASSERT_NE(sig3, sig4); + + // Different sig objects, but same sig + ASSERT_EQ(sig1, sig1_copy); + ASSERT_EQ(sig4, sig4_copy); + + // Match first 2 args, but not third + ASSERT_NE(sig4, sig5); + + ASSERT_NE(sig6, sig7); +} + +TEST(KernelSignature, VarargsEquals) { + KernelSignature sig1({int8()}, utf8(), /*is_varargs=*/true); + KernelSignature sig2({int8()}, utf8(), /*is_varargs=*/true); + KernelSignature sig3({int8()}, utf8()); + + ASSERT_EQ(sig1, sig2); + ASSERT_NE(sig2, sig3); +} + +TEST(KernelSignature, Hash) { + // Some basic tests to ensure that the hashes are deterministic and that all + // input arguments are incorporated + KernelSignature sig1({}, utf8()); + KernelSignature sig2({int8()}, utf8()); + KernelSignature sig3({int8(), int32()}, utf8()); + + ASSERT_EQ(sig1.Hash(), sig1.Hash()); + ASSERT_EQ(sig2.Hash(), sig2.Hash()); + ASSERT_NE(sig1.Hash(), sig2.Hash()); + ASSERT_NE(sig2.Hash(), sig3.Hash()); +} + +TEST(KernelSignature, MatchesInputs) { + // () -> boolean + KernelSignature sig1({}, boolean()); + + ASSERT_TRUE(sig1.MatchesInputs({})); + ASSERT_FALSE(sig1.MatchesInputs({int8()})); + + // (any[int8], any[decimal]) -> boolean + KernelSignature sig2({int8(), Type::DECIMAL}, boolean()); + + ASSERT_FALSE(sig2.MatchesInputs({})); + ASSERT_FALSE(sig2.MatchesInputs({int8()})); + ASSERT_TRUE(sig2.MatchesInputs({int8(), decimal(12, 2)})); + ASSERT_TRUE(sig2.MatchesInputs( + {ValueDescr::Scalar(int8()), ValueDescr::Scalar(decimal(12, 2))})); + ASSERT_TRUE( + sig2.MatchesInputs({ValueDescr::Array(int8()), ValueDescr::Array(decimal(12, 2))})); + + // (scalar[int8], array[int32]) -> boolean + KernelSignature sig3({ValueDescr::Scalar(int8()), ValueDescr::Array(int32())}, + boolean()); + + ASSERT_FALSE(sig3.MatchesInputs({})); + + // Unqualified, these are ANY type and 
do not match because the kernel + // requires a scalar and an array + ASSERT_FALSE(sig3.MatchesInputs({int8(), int32()})); + ASSERT_TRUE( + sig3.MatchesInputs({ValueDescr::Scalar(int8()), ValueDescr::Array(int32())})); + ASSERT_FALSE( + sig3.MatchesInputs({ValueDescr::Array(int8()), ValueDescr::Array(int32())})); +} + +TEST(KernelSignature, VarargsMatchesInputs) { + KernelSignature sig({int8()}, utf8(), /*is_varargs=*/true); + + std::vector args = {int8()}; + ASSERT_TRUE(sig.MatchesInputs(args)); + args.push_back(ValueDescr::Scalar(int8())); + args.push_back(ValueDescr::Array(int8())); + ASSERT_TRUE(sig.MatchesInputs(args)); + args.push_back(int32()); + ASSERT_FALSE(sig.MatchesInputs(args)); +} + +TEST(KernelSignature, ToString) { + std::vector in_types = {InputType(int8(), ValueDescr::SCALAR), + InputType(Type::DECIMAL, ValueDescr::ARRAY), + InputType(utf8())}; + KernelSignature sig(in_types, utf8()); + ASSERT_EQ("(scalar[int8], array[decimal*], any[string]) -> string", sig.ToString()); + + OutputType out_type( + [](const std::vector& args) { return Status::Invalid("NYI"); }); + KernelSignature sig2({int8(), Type::DECIMAL}, out_type); + ASSERT_EQ("(any[int8], any[decimal*]) -> computed", sig2.ToString()); +} + +TEST(KernelSignature, VarargsToString) { + KernelSignature sig({int8()}, utf8(), /*is_varargs=*/true); + ASSERT_EQ("varargs[any[int8]] -> string", sig.ToString()); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 12ad4d3a958..b230621ad53 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -15,37 +15,49 @@ # specific language governing permissions and limitations # under the License. 
-arrow_install_all_headers("arrow/compute/kernels") - -add_arrow_compute_test(boolean_test) -add_arrow_compute_test(cast_test) -add_arrow_compute_test(hash_test) -add_arrow_compute_test(isin_test) -add_arrow_compute_test(match_test) -add_arrow_compute_test(sort_to_indices_test) -add_arrow_compute_test(nth_to_indices_test) -add_arrow_compute_test(util_internal_test) -add_arrow_compute_test(add_test) +# ---------------------------------------------------------------------- +# Scalar kernels -# Aggregates -add_arrow_compute_test(aggregate_test) +add_arrow_compute_test(scalar_test + SOURCES + scalar_arithmetic_test.cc + scalar_boolean_test.cc + scalar_compare_test.cc + scalar_set_lookup_test.cc) -# Comparison -add_arrow_compute_test(compare_test) +# add_arrow_compute_test(cast_test) -# Selection -add_arrow_compute_test(take_test) -add_arrow_compute_test(filter_test) +add_arrow_benchmark(scalar_compare_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(sort_to_indices_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(nth_to_indices_benchmark PREFIX "arrow-compute") +# ---------------------------------------------------------------------- +# Vector kernels -# Aggregates -add_arrow_benchmark(aggregate_benchmark PREFIX "arrow-compute") +add_arrow_compute_test(vector_test + SOURCES + vector_partition_test.cc) + +# add_arrow_compute_test(hash_test) + +# add_arrow_benchmark(hash_benchmark PREFIX "arrow-compute") + +# Single-array sorting + +# add_arrow_compute_test(sort_to_indices_test) +# add_arrow_benchmark(sort_to_indices_benchmark PREFIX "arrow-compute") +# add_arrow_benchmark(nth_to_indices_benchmark PREFIX "arrow-compute") -# Comparison -add_arrow_benchmark(compare_benchmark PREFIX "arrow-compute") +# Array value selection -# Selection -add_arrow_benchmark(filter_benchmark PREFIX "arrow-compute") -add_arrow_benchmark(take_benchmark PREFIX "arrow-compute") +# add_arrow_compute_test(filter_test) +# add_arrow_compute_test(take_test) + +# 
add_arrow_benchmark(filter_benchmark PREFIX "arrow-compute")a +# add_arrow_benchmark(take_benchmark PREFIX "arrow-compute") + +# ---------------------------------------------------------------------- +# Aggregate kernels + +# Aggregates + +add_arrow_compute_test(aggregate_test) +# add_arrow_benchmark(aggregate_benchmark PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/kernels/add.cc b/cpp/src/arrow/compute/kernels/add.cc deleted file mode 100644 index 19eb153b5cd..00000000000 --- a/cpp/src/arrow/compute/kernels/add.cc +++ /dev/null @@ -1,131 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "arrow/compute/kernels/add.h" -#include "arrow/builder.h" -#include "arrow/compute/context.h" -#include "arrow/type_traits.h" - -namespace arrow { -namespace compute { - -template -class AddKernelImpl : public AddKernel { - private: - using ArrayType = typename TypeTraits::ArrayType; - std::shared_ptr result_type_; - - Status Add(FunctionContext* ctx, const std::shared_ptr& lhs, - const std::shared_ptr& rhs, std::shared_ptr* result) { - NumericBuilder builder; - RETURN_NOT_OK(builder.Reserve(lhs->length())); - for (int i = 0; i < lhs->length(); i++) { - if (lhs->IsNull(i) || rhs->IsNull(i)) { - builder.UnsafeAppendNull(); - } else { - builder.UnsafeAppend(lhs->Value(i) + rhs->Value(i)); - } - } - return builder.Finish(result); - } - - public: - explicit AddKernelImpl(std::shared_ptr result_type) - : result_type_(result_type) {} - - Status Call(FunctionContext* ctx, const Datum& lhs, const Datum& rhs, - Datum* out) override { - if (!lhs.is_array() || !rhs.is_array()) { - return Status::Invalid("AddKernel expects array values"); - } - if (lhs.length() != rhs.length()) { - return Status::Invalid("AddKernel expects arrays with the same length"); - } - auto lhs_array = lhs.make_array(); - auto rhs_array = rhs.make_array(); - std::shared_ptr result; - RETURN_NOT_OK(this->Add(ctx, lhs_array, rhs_array, &result)); - *out = result; - return Status::OK(); - } - - std::shared_ptr out_type() const override { return result_type_; } - - Status Add(FunctionContext* ctx, const std::shared_ptr& lhs, - const std::shared_ptr& rhs, std::shared_ptr* result) override { - auto lhs_array = std::static_pointer_cast(lhs); - auto rhs_array = std::static_pointer_cast(rhs); - return Add(ctx, lhs_array, rhs_array, result); - } -}; - -Status AddKernel::Make(const std::shared_ptr& value_type, - std::unique_ptr* out) { - AddKernel* kernel; - switch (value_type->id()) { - case Type::UINT8: - kernel = new AddKernelImpl(value_type); - break; - case Type::INT8: - kernel = new 
AddKernelImpl(value_type); - break; - case Type::UINT16: - kernel = new AddKernelImpl(value_type); - break; - case Type::INT16: - kernel = new AddKernelImpl(value_type); - break; - case Type::UINT32: - kernel = new AddKernelImpl(value_type); - break; - case Type::INT32: - kernel = new AddKernelImpl(value_type); - break; - case Type::UINT64: - kernel = new AddKernelImpl(value_type); - break; - case Type::INT64: - kernel = new AddKernelImpl(value_type); - break; - case Type::FLOAT: - kernel = new AddKernelImpl(value_type); - break; - case Type::DOUBLE: - kernel = new AddKernelImpl(value_type); - break; - default: - return Status::NotImplemented("Arithmetic operations on ", *value_type, " arrays"); - } - out->reset(kernel); - return Status::OK(); -} - -Status Add(FunctionContext* ctx, const Array& lhs, const Array& rhs, - std::shared_ptr* result) { - Datum result_datum; - std::unique_ptr kernel; - ARROW_RETURN_IF( - !lhs.type()->Equals(rhs.type()), - Status::Invalid("Array types should be equal to use arithmetic kernels")); - RETURN_NOT_OK(AddKernel::Make(lhs.type(), &kernel)); - RETURN_NOT_OK(kernel->Call(ctx, Datum(lhs.data()), Datum(rhs.data()), &result_datum)); - *result = result_datum.make_array(); - return Status::OK(); -} - -} // namespace compute -} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/add.h b/cpp/src/arrow/compute/kernels/add.h deleted file mode 100644 index 19991aa4473..00000000000 --- a/cpp/src/arrow/compute/kernels/add.h +++ /dev/null @@ -1,77 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "arrow/compute/kernel.h" -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; - -namespace compute { - -class FunctionContext; - -/// \brief Summarizes two arrays. -/// -/// Summarizes two arrays with the same length. -/// The output is an array with same length and type as input. -/// Types of both input arrays should be equal -/// -/// For example given lhs = [1, null, 3], rhs = [4, 5, 6], the output -/// will be [5, null, 7] -/// -/// \param[in] ctx the FunctionContext -/// \param[in] lhs the first array -/// \param[in] rhs the second array -/// \param[out] result the sum of first and second arrays - -ARROW_EXPORT -Status Add(FunctionContext* ctx, const Array& lhs, const Array& rhs, - std::shared_ptr* result); - -/// \brief BinaryKernel implementing Add operation -class ARROW_EXPORT AddKernel : public BinaryKernel { - public: - /// \brief BinaryKernel interface - /// - /// delegates to subclasses via Add() - Status Call(FunctionContext* ctx, const Datum& lhs, const Datum& rhs, - Datum* out) override = 0; - - /// \brief output type of this kernel - std::shared_ptr out_type() const override = 0; - - /// \brief single-array implementation - virtual Status Add(FunctionContext* ctx, const std::shared_ptr& lhs, - const std::shared_ptr& rhs, - std::shared_ptr* result) = 0; - - /// \brief factory for Add - /// - /// \param[in] value_type constructed AddKernel - /// \param[out] out created kernel - static Status Make(const std::shared_ptr& value_type, - 
std::unique_ptr* out); -}; -} // namespace compute -} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate.cc b/cpp/src/arrow/compute/kernels/aggregate.cc deleted file mode 100644 index 90337588615..00000000000 --- a/cpp/src/arrow/compute/kernels/aggregate.cc +++ /dev/null @@ -1,88 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include - -#include "arrow/compute/context.h" -#include "arrow/compute/kernels/aggregate.h" - -namespace arrow { -namespace compute { - -// Helper class that properly invokes destructor when state goes out of scope. 
-class ManagedAggregateState { - public: - ManagedAggregateState(std::shared_ptr& desc, - std::shared_ptr&& buffer) - : desc_(desc), state_(buffer) { - desc_->New(state_->mutable_data()); - } - - ~ManagedAggregateState() { desc_->Delete(state_->mutable_data()); } - - void* mutable_data() { return state_->mutable_data(); } - - static std::shared_ptr Make( - std::shared_ptr& desc, MemoryPool* pool) { - auto maybe_buf = AllocateBuffer(desc->Size(), pool); - if (!maybe_buf.ok()) { - return nullptr; - } - return std::make_shared(desc, *std::move(maybe_buf)); - } - - private: - std::shared_ptr desc_; - std::shared_ptr state_; -}; - -Status AggregateUnaryKernel::Call(FunctionContext* ctx, const Datum& input, Datum* out) { - if (!input.is_arraylike()) { - return Status::Invalid("AggregateKernel expects Array or ChunkedArray datum"); - } - auto state = ManagedAggregateState::Make(aggregate_function_, ctx->memory_pool()); - if (!state) { - return Status::OutOfMemory("AggregateState allocation failed"); - } - - if (input.is_array()) { - auto array = input.make_array(); - RETURN_NOT_OK(aggregate_function_->Consume(*array, state->mutable_data())); - } else { - auto chunked_array = input.chunked_array(); - for (int i = 0; i < chunked_array->num_chunks(); i++) { - auto tmp_state = - ManagedAggregateState::Make(aggregate_function_, ctx->memory_pool()); - if (!tmp_state) { - return Status::OutOfMemory("AggregateState allocation failed"); - } - RETURN_NOT_OK(aggregate_function_->Consume(*chunked_array->chunk(i), - tmp_state->mutable_data())); - RETURN_NOT_OK( - aggregate_function_->Merge(tmp_state->mutable_data(), state->mutable_data())); - } - } - - return aggregate_function_->Finalize(state->mutable_data(), out); -} - -std::shared_ptr AggregateUnaryKernel::out_type() const { - return aggregate_function_->out_type(); -} - -} // namespace compute -} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate.h b/cpp/src/arrow/compute/kernels/aggregate.h deleted file mode 
100644 index f342e31a0b6..00000000000 --- a/cpp/src/arrow/compute/kernels/aggregate.h +++ /dev/null @@ -1,115 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "arrow/compute/kernel.h" - -namespace arrow { - -class Array; -class Status; - -namespace compute { - -class FunctionContext; -struct Datum; - -/// AggregateFunction is an interface for Aggregates -/// -/// An aggregates transforms an array into single result called a state via the -/// Consume method.. State supports the merge operation via the Merge method. -/// State can be sealed into a final result via the Finalize method. -// -/// State ownership is handled by callers, thus the interface exposes 3 methods -/// for the caller to manage memory: -/// - Size -/// - New (placement new constructor invocation) -/// - Delete (state destructor) -/// -/// Design inspired by ClickHouse aggregate functions. -class AggregateFunction { - public: - /// \brief Consume an array into a state. - virtual Status Consume(const Array& input, void* state) const = 0; - - /// \brief Merge states. - virtual Status Merge(const void* src, void* dst) const = 0; - - /// \brief Convert state into a final result. 
- virtual Status Finalize(const void* src, Datum* output) const = 0; - - virtual ~AggregateFunction() {} - - virtual std::shared_ptr out_type() const = 0; - - /// State management methods. - virtual int64_t Size() const = 0; - virtual void New(void* ptr) const = 0; - virtual void Delete(void* ptr) const = 0; -}; - -/// AggregateFunction partial implementation for static type state -template -class AggregateFunctionStaticState : public AggregateFunction { - virtual Status Consume(const Array& input, State* state) const = 0; - virtual Status Merge(const State& src, State* dst) const = 0; - virtual Status Finalize(const State& src, Datum* output) const = 0; - - Status Consume(const Array& input, void* state) const final { - return Consume(input, static_cast(state)); - } - - Status Merge(const void* src, void* dst) const final { - return Merge(*static_cast(src), static_cast(dst)); - } - - /// \brief Convert state into a final result. - Status Finalize(const void* src, Datum* output) const final { - return Finalize(*static_cast(src), output); - } - - int64_t Size() const final { return sizeof(State); } - - void New(void* ptr) const final { - // By using placement-new syntax, the constructor of the State is invoked - // in the memory location defined by the caller. This only supports State - // with a parameter-less constructor. 
- new (ptr) State; - } - - void Delete(void* ptr) const final { static_cast(ptr)->~State(); } -}; - -/// \brief UnaryKernel implemented by an AggregateState -class ARROW_EXPORT AggregateUnaryKernel : public UnaryKernel { - public: - explicit AggregateUnaryKernel(std::shared_ptr& aggregate) - : aggregate_function_(aggregate) {} - - Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override; - - std::shared_ptr out_type() const override; - - private: - std::shared_ptr aggregate_function_; -}; - -} // namespace compute -} // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc new file mode 100644 index 00000000000..8fb02f18d9e --- /dev/null +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -0,0 +1,366 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/compute/kernels/aggregate_internal.h" +#include "arrow/compute/kernels/common.h" + +namespace arrow { +namespace compute { + +namespace { + +struct ScalarAggregator : public KernelState { + virtual void Consume(KernelContext* ctx, const ExecBatch& batch) = 0; + virtual void MergeFrom(KernelContext* ctx, const KernelState& src) = 0; + virtual void Finalize(KernelContext* ctx, Datum* out) = 0; +}; + +void AggregateConsume(KernelContext* ctx, const ExecBatch& batch) { + checked_cast(ctx->state())->Consume(ctx, batch); +} + +void AggregateMerge(KernelContext* ctx, const KernelState& src, KernelState* dst) { + checked_cast(dst)->MergeFrom(ctx, src); +} + +void AggregateFinalize(KernelContext* ctx, Datum* out) { + checked_cast(ctx->state())->Finalize(ctx, out); +} + +// ---------------------------------------------------------------------- +// Count implementation + +struct CountImpl : public ScalarAggregator { + explicit CountImpl(CountOptions options) + : options(std::move(options)), non_nulls(0), nulls(0) {} + + void Consume(KernelContext*, const ExecBatch& batch) override { + const ArrayData& input = *batch[0].array(); + const int64_t nulls = input.GetNullCount(); + this->nulls += nulls; + this->non_nulls += input.length - nulls; + } + + void MergeFrom(KernelContext*, const KernelState& src) override { + const auto& other_state = checked_cast(src); + this->non_nulls += other_state.non_nulls; + this->nulls += other_state.nulls; + } + + void Finalize(KernelContext* ctx, Datum* out) override { + const auto& state = checked_cast(*ctx->state()); + switch (state.options.count_mode) { + case CountOptions::COUNT_ALL: + *out = Datum(state.non_nulls); + break; + case CountOptions::COUNT_NULL: + *out = Datum(state.nulls); + break; + default: + ctx->SetStatus(Status::Invalid("Unknown CountOptions encountered")); + break; + } + } + + CountOptions options; + int64_t non_nulls = 0; + int64_t nulls = 0; +}; + +std::unique_ptr CountInit(KernelContext*, const 
Kernel&, + const FunctionOptions* options) { + return std::unique_ptr( + new CountImpl(static_cast(*options))); +} + +// ---------------------------------------------------------------------- +// Sum implementation + +template ::Type> +struct SumState { + using ThisType = SumState; + using T = typename TypeTraits::CType; + using ArrayType = typename TypeTraits::ArrayType; + + // A small number of elements rounded to the next cacheline. This should + // amount to a maximum of 4 cachelines when dealing with 8 bytes elements. + static constexpr int64_t kTinyThreshold = 32; + static_assert(kTinyThreshold >= (2 * CHAR_BIT) + 1, + "ConsumeSparse requires 3 bytes of null bitmap, and 17 is the" + "required minimum number of bits/elements to cover 3 bytes."); + + ThisType operator+(const ThisType& rhs) const { + return ThisType(this->count + rhs.count, this->sum + rhs.sum); + } + + ThisType& operator+=(const ThisType& rhs) { + this->count += rhs.count; + this->sum += rhs.sum; + + return *this; + } + + public: + void Consume(const Array& input) { + const ArrayType& array = static_cast(input); + if (input.null_count() == 0) { + (*this) += ConsumeDense(array); + } else if (input.length() <= kTinyThreshold) { + // In order to simplify ConsumeSparse implementation (requires at least 3 + // bytes of bitmap data), small arrays are handled differently. 
+ (*this) += ConsumeTiny(array); + } else { + (*this) += ConsumeSparse(array); + } + } + + size_t count = 0; + typename SumType::c_type sum = 0; + + private: + ThisType ConsumeDense(const ArrayType& array) const { + ThisType local; + const auto values = array.raw_values(); + const int64_t length = array.length(); + for (int64_t i = 0; i < length; i++) { + local.sum += values[i]; + } + local.count = length; + return local; + } + + ThisType ConsumeTiny(const ArrayType& array) const { + ThisType local; + + internal::BitmapReader reader(array.null_bitmap_data(), array.offset(), + array.length()); + const auto values = array.raw_values(); + for (int64_t i = 0; i < array.length(); i++) { + if (reader.IsSet()) { + local.sum += values[i]; + local.count++; + } + reader.Next(); + } + + return local; + } + + // While this is not branchless, gcc needs this to be in a different function + // for it to generate cmov which ends to be slightly faster than + // multiplication but safe for handling NaN with doubles. + inline T MaskedValue(bool valid, T value) const { return valid ? value : 0; } + + inline ThisType UnrolledSum(uint8_t bits, const T* values) const { + ThisType local; + + if (bits < 0xFF) { + // Some nulls + for (size_t i = 0; i < 8; i++) { + local.sum += MaskedValue(bits & (1U << i), values[i]); + } + local.count += BitUtil::kBytePopcount[bits]; + } else { + // No nulls + for (size_t i = 0; i < 8; i++) { + local.sum += values[i]; + } + local.count += 8; + } + + return local; + } + + ThisType ConsumeSparse(const ArrayType& array) const { + ThisType local; + + // Sliced bitmaps on non-byte positions induce problem with the branchless + // unrolled technique. Thus extra padding is added on both left and right + // side of the slice such that both ends are byte-aligned. The first and + // last bitmap are properly masked to ignore extra values induced by + // padding. + // + // The execution is divided in 3 sections. + // + // 1. Compute the sum of the first masked byte. 
+ // 2. Compute the sum of the middle bytes + // 3. Compute the sum of the last masked byte. + + const int64_t length = array.length(); + const int64_t offset = array.offset(); + + // The number of bytes covering the range, this includes partial bytes. + // This number bounded by `<= (length / 8) + 2`, e.g. a possible extra byte + // on the left, and on the right. + const int64_t covering_bytes = BitUtil::CoveringBytes(offset, length); + DCHECK_GE(covering_bytes, 3); + + // Align values to the first batch of 8 elements. Note that raw_values() is + // already adjusted with the offset, thus we rewind a little to align to + // the closest 8-batch offset. + const auto values = array.raw_values() - (offset % 8); + + // Align bitmap at the first consumable byte. + const auto bitmap = array.null_bitmap_data() + BitUtil::RoundDown(offset, 8) / 8; + + // Consume the first (potentially partial) byte. + const uint8_t first_mask = BitUtil::kTrailingBitmask[offset % 8]; + local += UnrolledSum(bitmap[0] & first_mask, values); + + // Consume the (full) middle bytes. The loop iterates in unit of + // batches of 8 values and 1 byte of bitmap. + for (int64_t i = 1; i < covering_bytes - 1; i++) { + local += UnrolledSum(bitmap[i], &values[i * 8]); + } + + // Consume the last (potentially partial) byte. 
+ const int64_t last_idx = covering_bytes - 1; + const uint8_t last_mask = BitUtil::kPrecedingWrappingBitmask[(offset + length) % 8]; + local += UnrolledSum(bitmap[last_idx] & last_mask, &values[last_idx * 8]); + + return local; + } +}; + +template +struct SumImpl : public ScalarAggregator { + using ArrayType = typename TypeTraits::ArrayType; + using ThisType = SumImpl; + using SumType = typename FindAccumulatorType::Type; + using OutputType = typename TypeTraits::ScalarType; + + void Consume(KernelContext*, const ExecBatch& batch) override { + this->state.Consume(ArrayType(batch[0].array())); + } + + void MergeFrom(KernelContext*, const KernelState& src) override { + const auto& other = checked_cast(src); + this->state += other.state; + } + + void Finalize(KernelContext*, Datum* out) override { + if (state.count == 0) { + out->value = std::make_shared(); + } else { + out->value = MakeScalar(state.sum); + } + } + + SumState state; +}; + +template +struct MeanImpl : public SumImpl { + void Finalize(KernelContext*, Datum* out) override { + const bool is_valid = this->state.count > 0; + const double divisor = static_cast(is_valid ? this->state.count : 1UL); + const double mean = static_cast(this->state.sum) / divisor; + + if (!is_valid) { + out->value = std::make_shared(); + } else { + out->value = std::make_shared(mean); + } + } +}; + +template