From 28a13320f4aa98107ef3d1032747c4f709dafc74 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou <antoine@python.org>
Date: Wed, 12 Mar 2025 10:45:47 +0100
Subject: [PATCH 1/5] GH-45755: [C++][Compute] Add winsorize function

---
 cpp/src/arrow/CMakeLists.txt                  |   1 +
 cpp/src/arrow/compute/api_vector.cc           |   9 +
 cpp/src/arrow/compute/api_vector.h            |  19 ++
 cpp/src/arrow/compute/kernels/CMakeLists.txt  |   1 +
 .../arrow/compute/kernels/aggregate_mode.cc   |   7 +-
 .../compute/kernels/aggregate_quantile.cc     |  26 +--
 .../arrow/compute/kernels/aggregate_test.cc   |   3 +-
 .../arrow/compute/kernels/codegen_internal.h  |  11 +-
 .../compute/kernels/scalar_cast_boolean.cc    |   3 +-
 .../compute/kernels/scalar_cast_string.cc     |   3 +-
 .../compute/kernels/vector_statistics.cc      | 208 ++++++++++++++++++
 .../compute/kernels/vector_statistics_test.cc | 168 ++++++++++++++
 cpp/src/arrow/compute/registry.cc             |   1 +
 cpp/src/arrow/compute/registry_internal.h     |   1 +
 cpp/src/arrow/type.cc                         |   6 +
 cpp/src/arrow/type.h                          |   4 +
 python/pyarrow/_compute.pyx                   |  21 ++
 python/pyarrow/compute.py                     |   1 +
 python/pyarrow/includes/libarrow.pxd          |   4 +
 python/pyarrow/tests/test_compute.py          |  12 +
 20 files changed, 480 insertions(+), 29 deletions(-)
 create mode 100644 cpp/src/arrow/compute/kernels/vector_statistics.cc
 create mode 100644 cpp/src/arrow/compute/kernels/vector_statistics_test.cc
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index e3c596d3895..a77ac4abfbf 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -783,6 +783,7 @@ if(ARROW_COMPUTE)
        compute/kernels/vector_run_end_encode.cc
        compute/kernels/vector_select_k.cc
        compute/kernels/vector_sort.cc
+       compute/kernels/vector_statistics.cc
        compute/kernels/vector_swizzle.cc
        compute/key_hash_internal.cc
        compute/key_map_internal.cc
diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc
index 53ceed1b089..012e403e705 100644
--- a/cpp/src/arrow/compute/api_vector.cc
+++ b/cpp/src/arrow/compute/api_vector.cc
@@ -142,6 +142,9 @@ static auto kSortOptionsType = GetFunctionOptionsType<SortOptions>(
 static auto kPartitionNthOptionsType = GetFunctionOptionsType<PartitionNthOptions>(
     DataMember("pivot", &PartitionNthOptions::pivot),
     DataMember("null_placement", &PartitionNthOptions::null_placement));
+static auto kWinsorizeOptionsType = GetFunctionOptionsType<WinsorizeOptions>(
+    DataMember("lower_limit", &WinsorizeOptions::lower_limit),
+    DataMember("upper_limit", &WinsorizeOptions::upper_limit));
 static auto kSelectKOptionsType = GetFunctionOptionsType<SelectKOptions>(
     DataMember("k", &SelectKOptions::k),
     DataMember("sort_keys", &SelectKOptions::sort_keys));
@@ -208,6 +211,11 @@ PartitionNthOptions::PartitionNthOptions(int64_t pivot, NullPlacement null_place
       null_placement(null_placement) {}
 constexpr char PartitionNthOptions::kTypeName[];
 
+WinsorizeOptions::WinsorizeOptions(double lower_limit, double upper_limit)
+    : FunctionOptions(internal::kWinsorizeOptionsType),
+      lower_limit(lower_limit),
+      upper_limit(upper_limit) {}
+
 SelectKOptions::SelectKOptions(int64_t k, std::vector<SortKey> sort_keys)
     : FunctionOptions(internal::kSelectKOptionsType),
       k(k),
@@ -275,6 +283,7 @@ void RegisterVectorOptions(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunctionOptionsType(kListFlattenOptionsType));
   DCHECK_OK(registry->AddFunctionOptionsType(kInversePermutationOptionsType));
   DCHECK_OK(registry->AddFunctionOptionsType(kScatterOptionsType));
+  DCHECK_OK(registry->AddFunctionOptionsType(kWinsorizeOptionsType));
 }
 }  // namespace internal
 
diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h
index 22bb1647197..69e4b243c97 100644
--- a/cpp/src/arrow/compute/api_vector.h
+++ b/cpp/src/arrow/compute/api_vector.h
@@ -228,6 +228,25 @@ class ARROW_EXPORT PartitionNthOptions : public FunctionOptions {
   NullPlacement null_placement;
 };
 
+class ARROW_EXPORT WinsorizeOptions : public FunctionOptions {
+ public:
+  WinsorizeOptions(double lower_limit, double upper_limit);
+  WinsorizeOptions() : WinsorizeOptions(0, 1) {}
+  static constexpr char const kTypeName[] = "WinsorizeOptions";
+
+  /// The quantile below which all values are replaced with the quantile's value.
+  /// Must be between 0 and 1, and not greater than upper_limit.
+  /// For example, if lower_limit = 0.05, then all values in the lower 5% percentile
+  /// will be replaced with the 5% percentile value.
+  double lower_limit;
+
+  /// The quantile above which all values are replaced with the quantile's value.
+  /// Must be between 0 and 1, and not less than lower_limit.
+  /// For example, if upper_limit = 0.95, then all values in the upper 5% percentile
+  /// will be replaced with the 95% percentile value.
+  double upper_limit;
+};
+
 /// \brief Options for cumulative functions
 /// \note Also aliased as CumulativeSumOptions for backward compatibility
 class ARROW_EXPORT CumulativeOptions : public FunctionOptions {
diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt
index 4dedd1f23e0..81b7adeb4aa 100644
--- a/cpp/src/arrow/compute/kernels/CMakeLists.txt
+++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt
@@ -105,6 +105,7 @@ add_arrow_compute_test(vector_test
                        vector_nested_test.cc
                        vector_replace_test.cc
                        vector_run_end_encode_test.cc
+                       vector_statistics_test.cc
                        select_k_test.cc
                        EXTRA_LINK_LIBS
                        arrow_compute_kernels_testing
diff --git a/cpp/src/arrow/compute/kernels/aggregate_mode.cc b/cpp/src/arrow/compute/kernels/aggregate_mode.cc
index 3f84c0a5ee4..e9723cef7b0 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_mode.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_mode.cc
@@ -495,10 +495,9 @@ void RegisterScalarAggregateMode(FunctionRegistry* registry) {
                     ModeExecutorChunked<StructType, BooleanType>::Exec)));
   for (const auto& type : NumericTypes()) {
     // TODO(wesm):
-    DCHECK_OK(func->AddKernel(NewModeKernel(
-        type, GenerateNumeric<ModeExecutor, StructType>(*type),
-        GenerateNumeric<ModeExecutorChunked, StructType, VectorKernel::ChunkedExec>(
-            *type))));
+    DCHECK_OK(func->AddKernel(
+        NewModeKernel(type, GenerateNumeric<ModeExecutor, StructType>(*type),
+                      GenerateNumeric<ModeExecutorChunked, StructType>(*type))));
   }
   // Type parameters are ignored
   DCHECK_OK(func->AddKernel(
diff --git a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc
index f4826229dd4..5e6007a0c1e 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc
@@ -76,6 +76,12 @@ template <typename T>
 double DataPointToDouble(T value, const DataType&) {
   return static_cast<double>(value);
 }
+double DataPointToDouble(const Decimal32& value, const DataType& ty) {
+  return value.ToDouble(checked_cast<const DecimalType&>(ty).scale());
+}
+double DataPointToDouble(const Decimal64& value, const DataType& ty) {
+  return value.ToDouble(checked_cast<const DecimalType&>(ty).scale());
+}
 double DataPointToDouble(const Decimal128& value, const DataType& ty) {
   return value.ToDouble(checked_cast<const DecimalType&>(ty).scale());
 }
@@ -524,23 +530,13 @@ void AddQuantileKernels(VectorFunction* func) {
     base.signature = KernelSignature::Make({InputType(ty)}, OutputType(ResolveOutput));
     // output type is determined at runtime, set template argument to nulltype
     base.exec = GenerateNumeric<QuantileExecutor, NullType>(*ty);
-    base.exec_chunked =
-        GenerateNumeric<QuantileExecutorChunked, NullType, VectorKernel::ChunkedExec>(
-            *ty);
-    DCHECK_OK(func->AddKernel(base));
-  }
-  {
-    base.signature =
-        KernelSignature::Make({InputType(Type::DECIMAL128)}, OutputType(ResolveOutput));
-    base.exec = QuantileExecutor<NullType, Decimal128Type>::Exec;
-    base.exec_chunked = QuantileExecutorChunked<NullType, Decimal128Type>::Exec;
+    base.exec_chunked = GenerateNumeric<QuantileExecutorChunked, NullType>(*ty);
     DCHECK_OK(func->AddKernel(base));
   }
-  {
-    base.signature =
-        KernelSignature::Make({InputType(Type::DECIMAL256)}, OutputType(ResolveOutput));
-    base.exec = QuantileExecutor<NullType, Decimal256Type>::Exec;
-    base.exec_chunked = QuantileExecutorChunked<NullType, Decimal256Type>::Exec;
+  for (auto type_id : DecimalTypeIds()) {
+    base.signature = KernelSignature::Make({type_id}, OutputType(ResolveOutput));
+    base.exec = GenerateDecimal<QuantileExecutor, NullType>(type_id);
+    base.exec_chunked = GenerateDecimal<QuantileExecutorChunked, NullType>(type_id);
     DCHECK_OK(func->AddKernel(base));
   }
 }
diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc
index 766b5b1cd6b..ec012a42cd3 100644
--- a/cpp/src/arrow/compute/kernels/aggregate_test.cc
+++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc
@@ -4263,7 +4263,8 @@ TEST(TestQuantileKernel, Decimal) {
     ValidateOutput(*out_array);
     AssertArraysEqual(*expected, *out_array, /*verbose=*/true);
   };
-  for (const auto& ty : {decimal128(3, 2), decimal256(3, 2)}) {
+  for (const auto& ty :
+       {decimal32(3, 2), decimal64(3, 2), decimal128(3, 2), decimal256(3, 2)}) {
     check(ArrayFromJSON(ty, R"(["1.00", "5.00", null])"),
           QuantileOptions(0.5, QuantileOptions::LINEAR),
           ArrayFromJSON(float64(), R"([3.00])"));
diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h
index 2a492f581f5..1d49579a5a9 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal.h
+++ b/cpp/src/arrow/compute/kernels/codegen_internal.h
@@ -988,9 +988,9 @@ struct FailFunctor<VectorKernel::ChunkedExec> {
 };
 
 // GD for numeric types (integer and floating point)
-template <template <typename...> class Generator, typename Type0,
-          typename KernelType = ArrayKernelExec, typename... Args>
-KernelType GenerateNumeric(detail::GetTypeId get_id) {
+template <template <typename...> class Generator, typename Type0, typename... Args>
+auto GenerateNumeric(detail::GetTypeId get_id) {
+  using KernelType = decltype(&Generator<Type0, Int8Type, Args...>::Exec);
   switch (get_id.id) {
     case Type::INT8:
       return Generator<Type0, Int8Type, Args...>::Exec;
@@ -1367,7 +1367,8 @@ ArrayKernelExec GenerateTemporal(detail::GetTypeId get_id) {
 //
 // See "Numeric" above for description of the generator functor
 template <template <typename...> class Generator, typename Type0, typename... Args>
-ArrayKernelExec GenerateDecimal(detail::GetTypeId get_id) {
+auto GenerateDecimal(detail::GetTypeId get_id) {
+  using KernelType = decltype(&Generator<Type0, Decimal256Type, Args...>::Exec);
   switch (get_id.id) {
     case Type::DECIMAL32:
       return Generator<Type0, Decimal32Type, Args...>::Exec;
@@ -1379,7 +1380,7 @@ ArrayKernelExec GenerateDecimal(detail::GetTypeId get_id) {
       return Generator<Type0, Decimal256Type, Args...>::Exec;
     default:
       DCHECK(false);
-      return nullptr;
+      return KernelType(nullptr);
   }
 }
 
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc b/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc
index cb1a67bad90..260981e1c36 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc
@@ -54,8 +54,7 @@ std::vector<std::shared_ptr<CastFunction>> GetBooleanCasts() {
 
   for (const auto& ty : NumericTypes()) {
     ArrayKernelExec exec =
-        GenerateNumeric<applicator::ScalarUnary, BooleanType, ArrayKernelExec, IsNonZero>(
-            *ty);
+        GenerateNumeric<applicator::ScalarUnary, BooleanType, IsNonZero>(*ty);
     DCHECK_OK(func->AddKernel(ty->id(), {ty}, boolean(), exec));
   }
   for (const auto& ty : BaseBinaryTypes()) {
diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
index 7186612d25a..74b80a4eb2a 100644
--- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
@@ -683,8 +683,7 @@ void AddNumberToStringCasts(CastFunction* func) {
 template <typename OutType>
 void AddDecimalToStringCasts(CastFunction* func) {
   auto out_ty = TypeTraits<OutType>::type_singleton();
-  for (const auto& in_tid : std::vector<Type::type>{Type::DECIMAL32, Type::DECIMAL64,
-                                                    Type::DECIMAL128, Type::DECIMAL256}) {
+  for (const auto& in_tid : DecimalTypeIds()) {
     DCHECK_OK(
         func->AddKernel(in_tid, {in_tid}, out_ty,
                         GenerateDecimal<DecimalToStringCastFunctor, OutType>(in_tid),
diff --git a/cpp/src/arrow/compute/kernels/vector_statistics.cc b/cpp/src/arrow/compute/kernels/vector_statistics.cc
new file mode 100644
index 00000000000..cb8a0ac546a
--- /dev/null
+++ b/cpp/src/arrow/compute/kernels/vector_statistics.cc
@@ -0,0 +1,243 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <utility>
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/function.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/kernels/codegen_internal.h"
+#include "arrow/compute/registry.h"
+#include "arrow/result.h"
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+namespace arrow::compute::internal {
+
+using ::arrow::internal::checked_cast;
+
+namespace {
+
+// Check that both limits lie within [0, 1] and that lower_limit <= upper_limit.
+// The negated range checks also reject NaN limits.
+Status ValidateOptions(const WinsorizeOptions& options) {
+  if (!(options.lower_limit >= 0 && options.lower_limit <= 1) ||
+      !(options.upper_limit >= 0 && options.upper_limit <= 1)) {
+    return Status::Invalid("winsorize limits must be between 0 and 1");
+  }
+  if (options.lower_limit > options.upper_limit) {
+    return Status::Invalid(
+        "winsorize upper limit must be equal or greater than lower limit");
+  }
+  return Status::OK();
+}
+
+using WinsorizeState = internal::OptionsWrapper<WinsorizeOptions>;
+
+// Winsorize kernel implementation for values of the given Arrow type.
+//
+// We have a first unused template parameter for compatibility with GenerateNumeric.
+template <typename Unused, typename Type>
+struct Winsorize {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using CType = typename TypeTraits<Type>::CType;
+
+  // Array execution: clip the values of the single input array.
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+    const auto& options = WinsorizeState::Get(ctx);
+    RETURN_NOT_OK(ValidateOptions(options));
+    ARROW_ASSIGN_OR_RAISE(auto maybe_quantiles,
+                          GetQuantileValues(ctx, batch.ToExecBatch(), options));
+    auto data = batch.values[0].array.ToArrayData();
+    auto out_data = out->array_data_mutable();
+    if (!maybe_quantiles.has_value()) {
+      // Only nulls and NaNs => return input as-is.
+      // The buffers are shared with the input, so its offset must be carried
+      // over too, otherwise a sliced input would be read from the wrong position.
+      out_data->null_count = data->null_count.load();
+      out_data->length = data->length;
+      out_data->offset = data->offset;
+      out_data->buffers = data->buffers;
+      return Status::OK();
+    }
+    return ClipValues(*data, maybe_quantiles.value(), out_data, ctx);
+  }
+
+  // Chunked execution: the quantiles are computed over the whole chunked array,
+  // then each chunk is clipped separately so as to preserve the chunk layout.
+  static Status ExecChunked(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    const auto& options = WinsorizeState::Get(ctx);
+    RETURN_NOT_OK(ValidateOptions(options));
+    ARROW_ASSIGN_OR_RAISE(auto maybe_quantiles, GetQuantileValues(ctx, batch, options));
+    const auto& chunked_array = batch.values[0].chunked_array();
+    if (!maybe_quantiles.has_value()) {
+      // Only nulls and NaNs => return input as-is
+      *out = chunked_array;
+      return Status::OK();
+    }
+    ArrayVector out_chunks;
+    out_chunks.reserve(chunked_array->num_chunks());
+    for (const auto& chunk : chunked_array->chunks()) {
+      auto out_data = chunk->data()->Copy();
+      RETURN_NOT_OK(
+          ClipValues(*chunk->data(), maybe_quantiles.value(), out_data.get(), ctx));
+      out_chunks.push_back(MakeArray(out_data));
+    }
+    return ChunkedArray::Make(std::move(out_chunks)).Value(out);
+  }
+
+  struct QuantileValues {
+    CType lower_bound, upper_bound;
+  };
+
+  // Compute the lower and upper quantile values of the input.
+  //
+  // Returns std::nullopt if the "quantile" function returned only nulls.
+  static Result<std::optional<QuantileValues>> GetQuantileValues(
+      KernelContext* ctx, const ExecBatch& batch, const WinsorizeOptions& options) {
+    // We use "nearest" to avoid the conversion of quantile values to double.
+    QuantileOptions quantile_options(/*q=*/{options.lower_limit, options.upper_limit},
+                                     QuantileOptions::NEAREST);
+    ARROW_ASSIGN_OR_RAISE(
+        auto quantile,
+        CallFunction("quantile", batch, &quantile_options, ctx->exec_context()));
+    auto quantile_array = quantile.array_as<ArrayType>();
+    DCHECK_EQ(quantile_array->length(), 2);
+    if (quantile_array->null_count() == 2) {
+      return std::nullopt;
+    }
+    DCHECK_EQ(quantile_array->null_count(), 0);
+    return QuantileValues{CType(quantile_array->Value(0)),
+                          CType(quantile_array->Value(1))};
+  }
+
+  // Fill `out` with the values of `data` clipped to the given quantile bounds.
+  //
+  // Nulls are preserved. The output is always written with a zero offset,
+  // regardless of the offset of `data`.
+  static Status ClipValues(const ArrayData& data, QuantileValues quantiles,
+                           ArrayData* out, KernelContext* ctx) {
+    DCHECK_EQ(out->buffers.size(), data.buffers.size());
+    out->null_count = data.null_count.load();
+    out->length = data.length;
+    // The values buffer allocated below only holds `length` entries, so the
+    // output must not inherit a non-zero offset (`out` may be a copy of `data`).
+    out->offset = 0;
+    if (data.offset != 0 && data.buffers[0] != nullptr) {
+      // Realign the validity bitmap on the zero output offset
+      ARROW_ASSIGN_OR_RAISE(
+          out->buffers[0],
+          arrow::internal::CopyBitmap(ctx->memory_pool(), data.buffers[0]->data(),
+                                      data.offset, data.length));
+    } else {
+      out->buffers[0] = data.buffers[0];
+    }
+    ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->Allocate(out->length * sizeof(CType)));
+    // Avoid leaving uninitialized memory under null entries
+    std::memset(out->buffers[1]->mutable_data(), 0, out->length * sizeof(CType));
+
+    const CType* in_values = data.GetValues<CType>(1);
+    CType* out_values = out->GetMutableValues<CType>(1);
+
+    auto visit = [&](int64_t position, int64_t length) {
+      for (int64_t i = position; i < position + length; ++i) {
+        if (in_values[i] < quantiles.lower_bound) {
+          out_values[i] = quantiles.lower_bound;
+        } else if (in_values[i] > quantiles.upper_bound) {
+          out_values[i] = quantiles.upper_bound;
+        } else {
+          // NaNs also fall here
+          out_values[i] = in_values[i];
+        }
+      }
+    };
+    arrow::internal::VisitSetBitRunsVoid(data.buffers[0], data.offset, data.length,
+                                         visit);
+    return Status::OK();
+  }
+};
+
+// Adapter exposing Winsorize::ExecChunked under the `Exec` name expected by
+// the kernel generators.
+template <typename Unused, typename Type>
+struct WinsorizeChunked {
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    return Winsorize<Unused, Type>::ExecChunked(ctx, batch, out);
+  }
+};
+
+// The output type is the same as the input type.
+Result<TypeHolder> ResolveWinsorizeOutput(KernelContext* ctx,
+                                          const std::vector<TypeHolder>& in_types) {
+  DCHECK_EQ(in_types.size(), 1);
+  return in_types[0];
+}
+
+const FunctionDoc winsorize_doc(
+    "Winsorize an array",
+    ("This function applies a winsorization transform to the input array\n"
+     "so as to reduce the influence of potential outliers.\n"
+     "NaNs and nulls in the input are ignored for the purpose of computing\n"
+     "the lower and upper quantiles.\n"
+     "The quantile limits can be changed in WinsorizeOptions."),
+    {"array"}, "WinsorizeOptions", /*options_required=*/true);
+
+}  // namespace
+
+// Register the "winsorize" function with kernels for all numeric and decimal types.
+void RegisterVectorStatistics(FunctionRegistry* registry) {
+  static const auto default_winsorize_options = WinsorizeOptions();
+
+  auto winsorize = std::make_shared<VectorFunction>(
+      "winsorize", Arity::Unary(), winsorize_doc, &default_winsorize_options);
+
+  VectorKernel base;
+  base.init = WinsorizeState::Init;
+  base.mem_allocation = MemAllocation::NO_PREALLOCATE;
+  base.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+  base.can_execute_chunkwise = false;
+  // The variable is ill-named, but since we output a ChunkedArray ourselves,
+  // the function execution logic shouldn't try to wrap it again.
+  base.output_chunked = false;
+
+  for (const auto& ty : NumericTypes()) {
+    base.signature = KernelSignature::Make({ty->id()}, &ResolveWinsorizeOutput);
+    base.exec = GenerateNumeric<Winsorize, /*Unused*/ void>(ty->id());
+    base.exec_chunked = GenerateNumeric<WinsorizeChunked, /*Unused*/ void>(ty->id());
+    DCHECK_OK(winsorize->AddKernel(base));
+  }
+  for (auto type_id : DecimalTypeIds()) {
+    base.signature = KernelSignature::Make({type_id}, &ResolveWinsorizeOutput);
+    base.exec = GenerateDecimal<Winsorize, /*Unused*/ void>(type_id);
+    base.exec_chunked = GenerateDecimal<WinsorizeChunked, /*Unused*/ void>(type_id);
+    DCHECK_OK(winsorize->AddKernel(base));
+  }
+  DCHECK_OK(registry->AddFunction(std::move(winsorize)));
+}
+
+}  // namespace arrow::compute::internal
diff --git a/cpp/src/arrow/compute/kernels/vector_statistics_test.cc b/cpp/src/arrow/compute/kernels/vector_statistics_test.cc
new file mode 100644
index 00000000000..00382511a9d
--- /dev/null
+++ b/cpp/src/arrow/compute/kernels/vector_statistics_test.cc
@@ -0,0 +1,168 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include "arrow/chunked_array.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/kernels/test_util_internal.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/type.h"
+
+namespace arrow::compute {
+
+using ::arrow::internal::checked_cast;
+
+class TestWinsorize : public ::testing::Test {
+ public:
+  void CheckWinsorize(const std::shared_ptr<DataType>& type, std::string_view json_input,
+                      std::string_view json_expected) {
+    auto input = ArrayFromJSON(type, json_input);
+    auto expected = ArrayFromJSON(type, json_expected);
+    CheckWinsorize(input, expected);
+  }
+
+  void CheckWinsorize(const std::shared_ptr<DataType>& type,
+                      const std::vector<std::string>& json_input,
+                      const std::vector<std::string>& json_expected) {
+    auto input = ChunkedArrayFromJSON(type, json_input);
+    auto expected = ChunkedArrayFromJSON(type, json_expected);
+    CheckWinsorize(input, expected);
+  }
+
+  void CheckWinsorize(const Datum& input, const Datum& expected) {
+    CheckVectorUnary("winsorize", input, expected, &options_);
+  }
+
+  WinsorizeOptions options_;
+};
+
+TEST_F(TestWinsorize, FloatingPoint) {
+  for (auto type : FloatingPointTypes()) {
+    options_.lower_limit = 0.25;
+    options_.upper_limit = 0.75;
+    CheckWinsorize(type, "[]", "[]");
+    CheckWinsorize(type, "[null, null]", "[null, null]");
+    CheckWinsorize(type, "[1.1, 2.2, 3.3, 44, 55]", "[2.2, 2.2, 3.3, 44, 44]");
+    CheckWinsorize(type, "[2.2, 1.1, 44, 55, 3.3]", "[2.2, 2.2, 44, 44, 3.3]");
+    CheckWinsorize(type, "[2.2, 1.1, null, null, 44, 55, 3.3]",
+                   "[2.2, 2.2, null, null, 44, 44, 3.3]");
+    CheckWinsorize(type, "[2.2, 1.1, null, null, NaN, 44, 55, 3.3]",
+                   "[2.2, 2.2, null, null, NaN, 44, 44, 3.3]");
+    // Chunked
+    CheckWinsorize(type, {"[2.2, 1.1, null]", "[]", "[NaN, 44, 55, 3.3]"},
+                   {"[2.2, 2.2, null]", "[]", "[NaN, 44, 44, 3.3]"});
+    CheckWinsorize(type, {"[]", "[]"}, {"[]", "[]"});
+    CheckWinsorize(ChunkedArrayFromJSON(type, {}), ChunkedArrayFromJSON(type, {}));
+
+    options_.lower_limit = 0.05;
+    CheckWinsorize(type, "[2.2, 1.1, 44, 55, 3.3]", "[2.2, 1.1, 44, 44, 3.3]");
+    options_.upper_limit = 0.95;
+    CheckWinsorize(type, "[2.2, 1.1, 44, 55, 3.3]", "[2.2, 1.1, 44, 55, 3.3]");
+    options_.lower_limit = 0;
+    options_.upper_limit = 1;
+    CheckWinsorize(type, "[2.2, 1.1, 44, 55, 3.3]", "[2.2, 1.1, 44, 55, 3.3]");
+    options_.lower_limit = options_.upper_limit = 0.5;
+    CheckWinsorize(type, "[2.2, 1.1, 44, 55, 3.3]", "[3.3, 3.3, 3.3, 3.3, 3.3]");
+  }
+}
+
+TEST_F(TestWinsorize, Integral) {
+  for (auto type : IntTypes()) {
+    options_.lower_limit = 0.25;
+    options_.upper_limit = 0.75;
+    CheckWinsorize(type, "[]", "[]");
+    CheckWinsorize(type, "[null, null]", "[null, null]");
+    CheckWinsorize(type, "[1, 2, 3, 44, 55]", "[2, 2, 3, 44, 44]");
+    CheckWinsorize(type, "[2, 1, 44, 55, 3]", "[2, 2, 44, 44, 3]");
+    CheckWinsorize(type, "[2, 1, null, null, 44, 55, 3]",
+                   "[2, 2, null, null, 44, 44, 3]");
+    // Chunked
+    CheckWinsorize(type, {"[2, 1, null]", "[]", "[null, 44, 55, 3]"},
+                   {"[2, 2, null]", "[]", "[null, 44, 44, 3]"});
+    CheckWinsorize(type, {"[]", "[]"}, {"[]", "[]"});
+    CheckWinsorize(ChunkedArrayFromJSON(type, {}), ChunkedArrayFromJSON(type, {}));
+
+    options_.lower_limit = 0.05;
+    CheckWinsorize(type, "[2, 1, 44, 55, 3]", "[2, 1, 44, 44, 3]");
+    options_.upper_limit = 0.95;
+    CheckWinsorize(type, "[2, 1, 44, 55, 3]", "[2, 1, 44, 55, 3]");
+  }
+}
+
+TEST_F(TestWinsorize, Decimal) {
+  for (auto type :
+       {decimal32(3, 1), decimal64(3, 1), decimal128(3, 1), decimal256(3, 1)}) {
+    options_.lower_limit = 0.25;
+    options_.upper_limit = 0.75;
+    CheckWinsorize(type, "[]", "[]");
+    CheckWinsorize(type, "[null, null]", "[null, null]");
+    CheckWinsorize(type, R"(["1.1", "2.2", "3.3", "44.4", "55.5"])",
+                   R"(["2.2", "2.2", "3.3", "44.4", "44.4"])");
+    CheckWinsorize(type, R"(["2.2", "1.1", "44.4", "55.5", "3.3"])",
+                   R"(["2.2", "2.2", "44.4", "44.4", "3.3"])");
+    CheckWinsorize(type, R"(["2.2", "1.1", null, null, "44.4", "55.5", "3.3"])",
+                   R"(["2.2", "2.2", null, null, "44.4", "44.4", "3.3"])");
+    // Chunked
+    CheckWinsorize(type, {R"(["2.2", "1.1"])", R"([null, null, "44.4", "55.5", "3.3"])"},
+                   {R"(["2.2", "2.2"])", R"([null, null, "44.4", "44.4", "3.3"])"});
+    CheckWinsorize(type, {"[]", "[]"}, {"[]", "[]"});
+    CheckWinsorize(ChunkedArrayFromJSON(type, {}), ChunkedArrayFromJSON(type, {}));
+
+    options_.lower_limit = 0.05;
+    CheckWinsorize(type, R"(["2.2", "1.1", "44.4", "55.5", "3.3"])",
+                   R"(["2.2", "1.1", "44.4", "44.4", "3.3"])");
+    options_.upper_limit = 0.95;
+    CheckWinsorize(type, R"(["2.2", "1.1", "44.4", "55.5", "3.3"])",
+                   R"(["2.2", "1.1", "44.4", "55.5", "3.3"])");
+  }
+}
+
+TEST_F(TestWinsorize, InvalidOptions) {
+  auto input = ArrayFromJSON(float64(), "[]");
+
+  options_.lower_limit = -0.1;
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
+                                  ::testing::HasSubstr("limits must be between 0 and 1"),
+                                  CallFunction("winsorize", {input}, &options_));
+  options_.lower_limit = 1.1;
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
+                                  ::testing::HasSubstr("limits must be between 0 and 1"),
+                                  CallFunction("winsorize", {input}, &options_));
+  options_.lower_limit = 0.1;
+  options_.upper_limit = -0.1;
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
+                                  ::testing::HasSubstr("limits must be between 0 and 1"),
+                                  CallFunction("winsorize", {input}, &options_));
+  options_.upper_limit = 1.1;
+  EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid,
+                                  ::testing::HasSubstr("limits must be between 0 and 1"),
+                                  CallFunction("winsorize", {input}, &options_));
+  options_.upper_limit = 0.1;
+  options_.lower_limit = 0.9;
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid, ::testing::HasSubstr("upper limit must be equal or greater"),
+      CallFunction("winsorize", {input}, &options_));
+}
+
+}  // namespace arrow::compute
diff --git a/cpp/src/arrow/compute/registry.cc b/cpp/src/arrow/compute/registry.cc
index 43982195414..b4f1c0f2f97 100644
--- a/cpp/src/arrow/compute/registry.cc
+++ b/cpp/src/arrow/compute/registry.cc
@@ -321,6 +321,7 @@ static std::unique_ptr<FunctionRegistry> CreateBuiltInRegistry() {
   RegisterVectorRunEndEncode(registry.get());
   RegisterVectorRunEndDecode(registry.get());
   RegisterVectorPairwise(registry.get());
+  RegisterVectorStatistics(registry.get());
   RegisterVectorSwizzle(registry.get());
 
   // Aggregate functions
diff --git a/cpp/src/arrow/compute/registry_internal.h b/cpp/src/arrow/compute/registry_internal.h
index b7749674913..5b9d7f8d608 100644
--- a/cpp/src/arrow/compute/registry_internal.h
+++ b/cpp/src/arrow/compute/registry_internal.h
@@ -56,6 +56,7 @@ void RegisterVectorSort(FunctionRegistry* registry);
 void RegisterVectorRunEndEncode(FunctionRegistry* registry);
 void RegisterVectorRunEndDecode(FunctionRegistry* registry);
 void RegisterVectorPairwise(FunctionRegistry* registry);
+void RegisterVectorStatistics(FunctionRegistry* registry);
 void RegisterVectorSwizzle(FunctionRegistry* registry);
 void RegisterVectorOptions(FunctionRegistry* registry);
 
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 2ebe57b2cc4..0ffa07a882f 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -3547,4 +3547,10 @@ const std::vector<TimeUnit::type>& TimeUnit::values() {
   return units;
 }
 
+const std::vector<Type::type>& DecimalTypeIds() {
+  static const std::vector<Type::type> kDecimalTypeIds = {
+      Type::DECIMAL32, Type::DECIMAL64, Type::DECIMAL128, Type::DECIMAL256};
+  return kDecimalTypeIds;
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 6b4f2c9f37f..0dd1d56c652 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -2630,4 +2630,8 @@ const std::vector<std::shared_ptr<DataType>>& DurationTypes();
 ARROW_EXPORT
 const std::vector<std::shared_ptr<DataType>>& PrimitiveTypes();
 
+/// \brief Decimal type ids
+ARROW_EXPORT
+const std::vector<Type::type>& DecimalTypeIds();
+
 }  // namespace arrow
diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx
index ed19cd71b3e..abfd9a5f1ee 100644
--- a/python/pyarrow/_compute.pyx
+++ b/python/pyarrow/_compute.pyx
@@ -2021,6 +2021,27 @@ class PartitionNthOptions(_PartitionNthOptions):
         self._set_options(pivot, null_placement)
 
 
+cdef class _WinsorizeOptions(FunctionOptions):
+    def _set_options(self, lower_limit, upper_limit):
+        self.wrapped.reset(new CWinsorizeOptions(lower_limit, upper_limit))
+
+
+class WinsorizeOptions(_WinsorizeOptions):
+    """
+    Options for the `winsorize` function.
+
+    Parameters
+    ----------
+    lower_limit : float, between 0 and 1
+        The quantile below which all values are replaced with the quantile's value.
+    upper_limit : float, between 0 and 1
+        The quantile above which all values are replaced with the quantile's value.
+    """
+
+    def __init__(self, lower_limit, upper_limit):
+        self._set_options(lower_limit, upper_limit)
+
+
 cdef class _CumulativeOptions(FunctionOptions):
     def _set_options(self, start, skip_nulls):
         if start is None:
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index 1809c74afc5..2daf12dec4c 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -82,6 +82,7 @@
     Utf8NormalizeOptions,
     VarianceOptions,
     WeekOptions,
+    WinsorizeOptions,
     # Functions
     call_function,
     function_registry,
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 00e5b9e9126..bb1135204f3 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -2715,6 +2715,10 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
         int64_t pivot
         CNullPlacement null_placement
 
+    cdef cppclass CWinsorizeOptions \
+            "arrow::compute::WinsorizeOptions"(CFunctionOptions):
+        CWinsorizeOptions(double lower_limit, double upper_limit)
+
     cdef cppclass CCumulativeOptions \
             "arrow::compute::CumulativeOptions"(CFunctionOptions):
         CCumulativeOptions(c_bool skip_nulls)
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index eaceabcb9cc..2710c6971af 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -198,6 +198,7 @@ def test_option_class_equality(request):
         pc.TrimOptions(" "),
         pc.Utf8NormalizeOptions("NFKC"),
         pc.VarianceOptions(),
+        pc.WinsorizeOptions(0.05, 0.9),
         pc.WeekOptions(week_starts_monday=True, count_from_zero=False,
                        first_week_is_fully_in_year=False),
     ]
@@ -3853,3 +3854,14 @@ def test_pivot_wider():
     with pytest.raises(ValueError, match="Encountered more than one non-null value"):
         result = pc.pivot_wider(["height", "width", "height"], [10, None, 11],
                                 key_names=key_names)
+
+
+def test_winsorize():
+    arr = pa.array([10, 4, 9, 8, 5, 3, 7, 2, 1, 6])
+    expected = [8, 4, 8, 8, 5, 3, 7, 2, 2, 6]
+    # Limits given as positional arguments.
+    assert pc.winsorize(arr, 0.1, 0.8).to_pylist() == expected
+    # Same limits given through an explicit options object.
+    options = pc.WinsorizeOptions(lower_limit=0.1, upper_limit=0.8)
+    result = pc.winsorize(arr, options=options)
+    assert result.to_pylist() == expected

From 2f57ff0ba27d71c2970021c5835c135b87c86ebf Mon Sep 17 00:00:00 2001
From: Antoine Pitrou <antoine@python.org>
Date: Mon, 17 Mar 2025 11:34:20 +0100
Subject: [PATCH 2/5] Call quantile function with Datum

---
 cpp/src/arrow/compute/kernels/vector_statistics.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/vector_statistics.cc b/cpp/src/arrow/compute/kernels/vector_statistics.cc
index cb8a0ac546a..bf72651105c 100644
--- a/cpp/src/arrow/compute/kernels/vector_statistics.cc
+++ b/cpp/src/arrow/compute/kernels/vector_statistics.cc
@@ -63,9 +63,8 @@ struct Winsorize {
   static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
     const auto& options = WinsorizeState::Get(ctx);
     RETURN_NOT_OK(ValidateOptions(options));
-    ARROW_ASSIGN_OR_RAISE(auto maybe_quantiles,
-                          GetQuantileValues(ctx, batch.ToExecBatch(), options));
     auto data = batch.values[0].array.ToArrayData();
+    ARROW_ASSIGN_OR_RAISE(auto maybe_quantiles, GetQuantileValues(ctx, data, options));
     auto out_data = out->array_data_mutable();
     if (!maybe_quantiles.has_value()) {
       // Only nulls and NaNs => return input as-is
@@ -80,8 +79,9 @@ struct Winsorize {
   static Status ExecChunked(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     const auto& options = WinsorizeState::Get(ctx);
     RETURN_NOT_OK(ValidateOptions(options));
-    ARROW_ASSIGN_OR_RAISE(auto maybe_quantiles, GetQuantileValues(ctx, batch, options));
     const auto& chunked_array = batch.values[0].chunked_array();
+    ARROW_ASSIGN_OR_RAISE(auto maybe_quantiles,
+                          GetQuantileValues(ctx, chunked_array, options));
     if (!maybe_quantiles.has_value()) {
       // Only nulls and NaNs => return input as-is
       *out = chunked_array;
@@ -103,13 +103,13 @@ struct Winsorize {
   };
 
   static Result<std::optional<QuantileValues>> GetQuantileValues(
-      KernelContext* ctx, const ExecBatch& batch, const WinsorizeOptions& options) {
+      KernelContext* ctx, const Datum& input, const WinsorizeOptions& options) {
     // We use "nearest" to avoid the conversion of quantile values to double.
     QuantileOptions quantile_options(/*q=*/{options.lower_limit, options.upper_limit},
                                      QuantileOptions::NEAREST);
     ARROW_ASSIGN_OR_RAISE(
         auto quantile,
-        CallFunction("quantile", batch, &quantile_options, ctx->exec_context()));
+        CallFunction("quantile", {input}, &quantile_options, ctx->exec_context()));
     auto quantile_array = quantile.array_as<ArrayType>();
     DCHECK_EQ(quantile_array->length(), 2);
     if (quantile_array->null_count() == 2) {

From af26f6fec7e36e28781c04e78bee34eb9a41f9d8 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou <antoine@python.org>
Date: Mon, 17 Mar 2025 14:27:58 +0100
Subject: [PATCH 3/5] Add docs

---
 docs/source/cpp/compute.rst | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 57673dfe1fc..c09d0974cb7 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -1728,6 +1728,18 @@ overflow is detected.
 
 * \(2) :member:`CumulativeOptions::start` is ignored.
 
+Other numeric functions
+~~~~~~~~~~~~~~~~~~~~~~~
+
++-------------------------+-------+-------------+-------------+--------------------------------+-----------+
+| Function name           | Arity | Input types | Output type | Options class                  | Notes     |
++=========================+=======+=============+=============+================================+===========+
+| winsorize               | Unary | Numeric     | Numeric     | :struct:`WinsorizeOptions`     | \(1)      |
++-------------------------+-------+-------------+-------------+--------------------------------+-----------+
+
+* \(1) Clamp values in the lower and upper quantiles to reduce the statistical
+  influence of outliers. The quantiles can be configured in :struct:`WinsorizeOptions`.
+
 Associative transforms
 ~~~~~~~~~~~~~~~~~~~~~~
 

From f70088536ab18faf08f63394dac500c42d286769 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou <antoine@python.org>
Date: Mon, 17 Mar 2025 16:46:30 +0100
Subject: [PATCH 4/5] Disambiguate function overloads for C++20

---
 .../compute/kernels/vector_statistics_test.cc | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/vector_statistics_test.cc b/cpp/src/arrow/compute/kernels/vector_statistics_test.cc
index 00382511a9d..97715cdaedd 100644
--- a/cpp/src/arrow/compute/kernels/vector_statistics_test.cc
+++ b/cpp/src/arrow/compute/kernels/vector_statistics_test.cc
@@ -42,9 +42,9 @@ class TestWinsorize : public ::testing::Test {
     CheckWinsorize(input, expected);
   }
 
-  void CheckWinsorize(const std::shared_ptr<DataType>& type,
-                      const std::vector<std::string>& json_input,
-                      const std::vector<std::string>& json_expected) {
+  void CheckWinsorizeChunked(const std::shared_ptr<DataType>& type,
+                             const std::vector<std::string>& json_input,
+                             const std::vector<std::string>& json_expected) {
     auto input = ChunkedArrayFromJSON(type, json_input);
     auto expected = ChunkedArrayFromJSON(type, json_expected);
     CheckWinsorize(input, expected);
@@ -70,9 +70,9 @@ TEST_F(TestWinsorize, FloatingPoint) {
     CheckWinsorize(type, "[2.2, 1.1, null, null, NaN, 44, 55, 3.3]",
                    "[2.2, 2.2, null, null, NaN, 44, 44, 3.3]");
     // Chunked
-    CheckWinsorize(type, {"[2.2, 1.1, null]", "[]", "[NaN, 44, 55, 3.3]"},
-                   {"[2.2, 2.2, null]", "[]", "[NaN, 44, 44, 3.3]"});
-    CheckWinsorize(type, {"[]", "[]"}, {"[]", "[]"});
+    CheckWinsorizeChunked(type, {"[2.2, 1.1, null]", "[]", "[NaN, 44, 55, 3.3]"},
+                          {"[2.2, 2.2, null]", "[]", "[NaN, 44, 44, 3.3]"});
+    CheckWinsorizeChunked(type, {"[]", "[]"}, {"[]", "[]"});
     CheckWinsorize(ChunkedArrayFromJSON(type, {}), ChunkedArrayFromJSON(type, {}));
 
     options_.lower_limit = 0.05;
@@ -98,9 +98,9 @@ TEST_F(TestWinsorize, Integral) {
     CheckWinsorize(type, "[2, 1, null, null, 44, 55, 3]",
                    "[2, 2, null, null, 44, 44, 3]");
     // Chunked
-    CheckWinsorize(type, {"[2, 1, null]", "[]", "[null, 44, 55, 3]"},
-                   {"[2, 2, null]", "[]", "[null, 44, 44, 3]"});
-    CheckWinsorize(type, {"[]", "[]"}, {"[]", "[]"});
+    CheckWinsorizeChunked(type, {"[2, 1, null]", "[]", "[null, 44, 55, 3]"},
+                          {"[2, 2, null]", "[]", "[null, 44, 44, 3]"});
+    CheckWinsorizeChunked(type, {"[]", "[]"}, {"[]", "[]"});
     CheckWinsorize(ChunkedArrayFromJSON(type, {}), ChunkedArrayFromJSON(type, {}));
 
     options_.lower_limit = 0.05;
@@ -124,9 +124,10 @@ TEST_F(TestWinsorize, Decimal) {
     CheckWinsorize(type, R"(["2.2", "1.1", null, null, "44.4", "55.5", "3.3"])",
                    R"(["2.2", "2.2", null, null, "44.4", "44.4", "3.3"])");
     // Chunked
-    CheckWinsorize(type, {R"(["2.2", "1.1"])", R"([null, null, "44.4", "55.5", "3.3"])"},
-                   {R"(["2.2", "2.2"])", R"([null, null, "44.4", "44.4", "3.3"])"});
-    CheckWinsorize(type, {"[]", "[]"}, {"[]", "[]"});
+    CheckWinsorizeChunked(
+        type, {R"(["2.2", "1.1"])", R"([null, null, "44.4", "55.5", "3.3"])"},
+        {R"(["2.2", "2.2"])", R"([null, null, "44.4", "44.4", "3.3"])"});
+    CheckWinsorizeChunked(type, {"[]", "[]"}, {"[]", "[]"});
     CheckWinsorize(ChunkedArrayFromJSON(type, {}), ChunkedArrayFromJSON(type, {}));
 
     options_.lower_limit = 0.05;

From 6750680c42ae7425ac4fbb4e70df70b72690ba4f Mon Sep 17 00:00:00 2001
From: Antoine Pitrou <antoine@python.org>
Date: Thu, 20 Mar 2025 11:02:52 +0100
Subject: [PATCH 5/5] Address review comments

---
 cpp/src/arrow/compute/kernels/vector_statistics.cc | 1 +
 docs/source/cpp/compute.rst                        | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/vector_statistics.cc b/cpp/src/arrow/compute/kernels/vector_statistics.cc
index bf72651105c..0f4e5df2dc7 100644
--- a/cpp/src/arrow/compute/kernels/vector_statistics.cc
+++ b/cpp/src/arrow/compute/kernels/vector_statistics.cc
@@ -112,6 +112,7 @@ struct Winsorize {
         CallFunction("quantile", {input}, &quantile_options, ctx->exec_context()));
     auto quantile_array = quantile.array_as<ArrayType>();
     DCHECK_EQ(quantile_array->length(), 2);
+    // The quantile function outputs either all nulls or no nulls at all.
     if (quantile_array->null_count() == 2) {
       return std::nullopt;
     }
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index c09d0974cb7..6bbcac00740 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -1728,8 +1728,8 @@ overflow is detected.
 
 * \(2) :member:`CumulativeOptions::start` is ignored.
 
-Other numeric functions
-~~~~~~~~~~~~~~~~~~~~~~~
+Statistical functions
+~~~~~~~~~~~~~~~~~~~~~
 
 +-------------------------+-------+-------------+-------------+--------------------------------+-----------+
 | Function name           | Arity | Input types | Output type | Options class                  | Notes     |