From 1e85ca0bfafe78ffeca1fb23bcb286f8833ecdb4 Mon Sep 17 00:00:00 2001
From: David Li
Date: Thu, 17 Jun 2021 10:30:09 -0400
Subject: [PATCH] ARROW-13064: [C++] Implement 'coalesce' function
---
.../arrow/compute/kernels/codegen_internal.cc | 4 +-
.../arrow/compute/kernels/scalar_if_else.cc | 273 ++++++++++++++++++
.../kernels/scalar_if_else_benchmark.cc | 61 ++++
.../compute/kernels/scalar_if_else_test.cc | 187 ++++++++++++
cpp/src/arrow/util/bit_block_counter.h | 13 +
docs/source/cpp/compute.rst | 43 +--
docs/source/python/api/compute.rst | 1 +
7 files changed, 562 insertions(+), 20 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc
index 673db088eae..bab8e7000cd 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal.cc
+++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc
@@ -185,7 +185,9 @@ const std::vector>& ExampleParametricTypes() {
// work above
Result FirstType(KernelContext*, const std::vector& descrs) {
- return descrs[0];
+ // Output resolver: start from the first argument's descriptor, but take the
+ // shape from GetBroadcastShape() over all arguments rather than from argument 0
+ // alone (presumably so that a scalar first argument mixed with array arguments
+ // still resolves to an array output - confirm against GetBroadcastShape).
+ ValueDescr result = descrs.front();
+ result.shape = GetBroadcastShape(descrs);
+ return result;
}
void EnsureDictionaryDecoded(std::vector* descrs) {
diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
index e8578305bf6..a745eb4fa6e 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc
@@ -1163,6 +1163,22 @@ void CopyValues(const Datum& in_values, const int64_t in_offset, const int64_t l
}
}
+// Copy exactly one element out of an array, working on raw buffer pointers. The
+// caller resolves the validity/value pointers once up front, so a tight loop does
+// not pay for MayHaveNulls() and Buffer::data() (and their internal checks) on
+// every element.
+//
+// in_valid may be null, in which case the element is treated as valid.
+// out_valid may be null, in which case no validity bit is written.
+template
+void CopyOneArrayValue(const DataType& type, const uint8_t* in_valid,
+                       const uint8_t* in_values, const int64_t in_offset,
+                       uint8_t* out_valid, uint8_t* out_values,
+                       const int64_t out_offset) {
+  if (out_valid != nullptr) {
+    const bool is_valid =
+        (in_valid == nullptr) || BitUtil::GetBit(in_valid, in_offset);
+    BitUtil::SetBitTo(out_valid, out_offset, is_valid);
+  }
+  CopyFixedWidth::CopyArray(type, in_values, in_offset, /*length=*/1, out_values,
+                            out_offset);
+}
+
+
struct CaseWhenFunction : ScalarFunction {
using ScalarFunction::ScalarFunction;
@@ -1372,6 +1388,221 @@ struct CaseWhenFunctor {
}
};
+// ScalarFunction for 'coalesce' with a custom DispatchBest.
+struct CoalesceFunction : ScalarFunction {
+  using ScalarFunction::ScalarFunction;
+
+  // Select a kernel for `values`. The argument types are first tried as given;
+  // if that fails, dictionaries are decoded and (when CommonNumeric() yields a
+  // type) the arguments are rewritten to it before dispatching once more.
+  // Returns the NoMatchingKernel() error when both attempts fail.
+  Result DispatchBest(std::vector* values) const override {
+    using arrow::compute::detail::DispatchExactImpl;
+    using arrow::compute::detail::NoMatchingKernel;
+
+    RETURN_NOT_OK(CheckArity(*values));
+
+    // First attempt: exact match on the types as passed in
+    auto kernel = DispatchExactImpl(this, *values);
+    if (kernel) return kernel;
+
+    // Second attempt: after implicit conversions
+    EnsureDictionaryDecoded(values);
+    auto common_type = CommonNumeric(*values);
+    if (common_type) {
+      ReplaceTypes(common_type, values);
+    }
+    kernel = DispatchExactImpl(this, *values);
+    if (kernel) return kernel;
+
+    return NoMatchingKernel(this, *values);
+  }
+};
+
+// 'coalesce' (SQL) when every argument is a scalar: the output becomes the first
+// argument whose scalar is valid. If none is valid, *out is left as the caller
+// passed it in.
+Status ExecScalarCoalesce(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+  for (size_t i = 0; i < batch.values.size(); ++i) {
+    const Datum& candidate = batch.values[i];
+    if (!candidate.scalar()->is_valid) continue;
+    *out = candidate;
+    return Status::OK();
+  }
+  return Status::OK();
+}
+
+// Helper: copy from a source datum into all null slots of the output.
+//
+// The output validity bitmap is used as the "already filled" mask: only slots
+// whose bit is still clear are written. The caller in ExecArrayCoalesce passes a
+// valid scalar or an array without nulls as `source`.
+//
+// `source` is taken by const reference: it is only forwarded to CopyValues
+// (which itself takes `const Datum&`), so copying the Datum on every call is
+// unnecessary work.
+//
+// \param source      datum to copy values from
+// \param out_valid   output validity bitmap (read as mask, passed on to CopyValues)
+// \param out_values  output values buffer
+// \param out_offset  offset of the output array within its buffers
+// \param length      number of slots to process
+template
+void CopyValuesAllValid(const Datum& source, uint8_t* out_valid, uint8_t* out_values,
+                        const int64_t out_offset, const int64_t length) {
+  BitBlockCounter counter(out_valid, out_offset, length);
+  int64_t offset = 0;
+  while (offset < length) {
+    const auto block = counter.NextWord();
+    if (block.NoneSet()) {
+      // No slot in this block has been filled yet: copy the whole run at once
+      CopyValues(source, offset, block.length, out_valid, out_values,
+                 out_offset + offset);
+    } else if (!block.AllSet()) {
+      // Partially filled block: copy one value at a time into the unset slots
+      for (int64_t j = 0; j < block.length; ++j) {
+        if (!BitUtil::GetBit(out_valid, out_offset + offset + j)) {
+          CopyValues(source, offset + j, 1, out_valid, out_values,
+                     out_offset + offset + j);
+        }
+      }
+    }
+    // Fully filled blocks are skipped entirely
+    offset += block.length;
+  }
+}
+
+// Helper: write zeros into the values buffer for every output slot whose validity
+// bit is clear. Slots that are marked valid are left untouched.
+//
+// Types with a bit width of 1 are treated as bit-packed (individual bits are
+// cleared); all other types are zeroed byte_width bytes per slot.
+void InitializeNullSlots(const DataType& type, uint8_t* out_valid, uint8_t* out_values,
+                         const int64_t out_offset, const int64_t length) {
+  const auto bit_width = checked_cast(type).bit_width();
+  const auto byte_width = BitUtil::BytesForBits(bit_width);
+  const bool bit_packed = (bit_width == 1);
+
+  BitBlockCounter counter(out_valid, out_offset, length);
+  int64_t position = 0;
+  while (position < length) {
+    const auto block = counter.NextWord();
+    const int64_t first_slot = out_offset + position;
+    if (block.NoneSet()) {
+      // Entire block is null: clear it in one call
+      if (bit_packed) {
+        BitUtil::SetBitsTo(out_values, first_slot, block.length, false);
+      } else {
+        std::memset(out_values + first_slot * byte_width, 0x00,
+                    byte_width * block.length);
+      }
+    } else if (!block.AllSet()) {
+      // Mixed block: visit each slot and clear only the null ones
+      for (int64_t j = 0; j < block.length; ++j) {
+        const int64_t slot = first_slot + j;
+        if (BitUtil::GetBit(out_valid, slot)) continue;
+        if (bit_packed) {
+          BitUtil::ClearBit(out_values, slot);
+        } else {
+          std::memset(out_values + slot * byte_width, 0x00, byte_width);
+        }
+      }
+    }
+    position += block.length;
+  }
+}
+
+// Implement 'coalesce' for any mix of scalar/array arguments for any fixed-width type
+//
+// Arguments are visited in order. The output validity bitmap records which slots
+// have already received a value, so each later argument only fills slots that are
+// still unset. Processing stops at the first argument that is valid everywhere
+// (valid scalar or array without nulls); null scalars match neither branch and
+// are skipped.
+template
+Status ExecArrayCoalesce(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ ArrayData* output = out->mutable_array();
+ const int64_t out_offset = output->offset;
+ // Use output validity buffer as mask to decide what values to copy
+ uint8_t* out_valid = output->buffers[0]->mutable_data();
+ // Clear output buffer - no values are set initially
+ BitUtil::SetBitsTo(out_valid, out_offset, batch.length, false);
+ uint8_t* out_values = output->buffers[1]->mutable_data();
+
+ for (const auto& datum : batch.values) {
+ if ((datum.is_scalar() && datum.scalar()->is_valid) ||
+ (datum.is_array() && !datum.array()->MayHaveNulls())) {
+ // Valid scalar, or all-valid array
+ // Every remaining slot gets filled here, so later arguments cannot contribute
+ CopyValuesAllValid(datum, out_valid, out_values, out_offset, batch.length);
+ break;
+ } else if (datum.is_array()) {
+ // Array with nulls
+ const ArrayData& arr = *datum.array();
+ const DataType& type = *datum.type();
+ const uint8_t* in_valid = arr.buffers[0]->data();
+ const uint8_t* in_values = arr.buffers[1]->data();
+ // Counts, per block, the slots that are valid in the input AND still unset in
+ // the output (in_valid & ~out_valid), i.e. the slots this argument should fill
+ BinaryBitBlockCounter counter(in_valid, arr.offset, out_valid, out_offset,
+ batch.length);
+ int64_t offset = 0;
+ while (offset < batch.length) {
+ const auto block = counter.NextAndNotWord();
+ if (block.AllSet()) {
+ // Every slot of the block is to be filled: bulk copy
+ CopyValues(datum, offset, block.length, out_valid, out_values,
+ out_offset + offset);
+ } else if (block.popcount) {
+ // Some slots are to be filled: test and copy them one by one
+ for (int64_t j = 0; j < block.length; ++j) {
+ if (!BitUtil::GetBit(out_valid, out_offset + offset + j) &&
+ BitUtil::GetBit(in_valid, arr.offset + offset + j)) {
+ // This version lets us avoid calling MayHaveNulls() on every iteration
+ // (which does an atomic load and can add up)
+ CopyOneArrayValue(type, in_valid, in_values, arr.offset + offset + j,
+ out_valid, out_values, out_offset + offset + j);
+ }
+ }
+ }
+ offset += block.length;
+ }
+ }
+ }
+
+ // Initialize any remaining null slots (uninitialized memory)
+ InitializeNullSlots(*out->type(), out_valid, out_values, out_offset, batch.length);
+ return Status::OK();
+}
+
+// Generic 'coalesce' kernel entry point: batches that contain at least one array
+// argument go through ExecArrayCoalesce, all-scalar batches through
+// ExecScalarCoalesce.
+template
+struct CoalesceFunctor {
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    bool has_array_argument = false;
+    for (const auto& datum : batch.values) {
+      if (datum.is_array()) {
+        has_array_argument = true;
+        break;
+      }
+    }
+    if (has_array_argument) {
+      return ExecArrayCoalesce(ctx, batch, out);
+    }
+    return ExecScalarCoalesce(ctx, batch, out);
+  }
+};
+
+// Full specialization of CoalesceFunctor whose Exec does no work: it returns OK
+// without reading the batch or writing to `out`.
+// NOTE(review): presumably the null-type kernel, where the output prepared by the
+// executor is already all-null - confirm.
+template <>
+struct CoalesceFunctor {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ return Status::OK();
+ }
+};
+
+// Partial specialization of CoalesceFunctor for variable-length (offsets + data)
+// types. Unlike the fixed-width path, which writes into the output buffers in
+// place, the result is assembled row by row with a builder.
+template
+struct CoalesceFunctor> {
+ using offset_type = typename Type::offset_type;
+ using BuilderType = typename TypeTraits::BuilderType;
+ // Dispatch: any array argument -> ExecArray, otherwise all-scalar path.
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ for (const auto& datum : batch.values) {
+ if (datum.is_array()) {
+ return ExecArray(ctx, batch, out);
+ }
+ }
+ return ExecScalarCoalesce(ctx, batch, out);
+ }
+
+ // Array path. Returns early without building anything when the first non-null
+ // argument already determines the whole result.
+ static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // Special case: grab any leading non-null scalar or array arguments
+ // Null scalars are skipped; the loop ends at the first other argument, which is
+ // returned as-is (or broadcast, for a scalar) if it contains no nulls.
+ for (const auto& datum : batch.values) {
+ if (datum.is_scalar()) {
+ if (!datum.scalar()->is_valid) continue;
+ ARROW_ASSIGN_OR_RAISE(
+ *out, MakeArrayFromScalar(*datum.scalar(), batch.length, ctx->memory_pool()));
+ return Status::OK();
+ } else if (datum.is_array() && !datum.array()->MayHaveNulls()) {
+ *out = datum;
+ return Status::OK();
+ }
+ break;
+ }
+ ArrayData* output = out->mutable_array();
+ BuilderType builder(batch[0].type(), ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(batch.length));
+ // General case: for each row, append the value of the first argument that is
+ // non-null in that row, or a null if there is none.
+ for (int64_t i = 0; i < batch.length; i++) {
+ bool set = false;
+ for (const auto& datum : batch.values) {
+ if (datum.is_scalar()) {
+ if (datum.scalar()->is_valid) {
+ RETURN_NOT_OK(builder.Append(UnboxScalar::Unbox(*datum.scalar())));
+ set = true;
+ break;
+ }
+ } else {
+ const ArrayData& source = *datum.array();
+ if (!source.MayHaveNulls() ||
+ BitUtil::GetBit(source.buffers[0]->data(), source.offset + i)) {
+ // buffers[2] holds the value bytes; buffer 1 holds the offsets that
+ // delimit row i as [offsets[i], offsets[i + 1])
+ const uint8_t* data = source.buffers[2]->data();
+ const offset_type* offsets = source.GetValues(1);
+ const offset_type offset0 = offsets[i];
+ const offset_type offset1 = offsets[i + 1];
+ RETURN_NOT_OK(builder.Append(data + offset0, offset1 - offset0));
+ set = true;
+ break;
+ }
+ }
+ }
+ if (!set) RETURN_NOT_OK(builder.AppendNull());
+ }
+ ARROW_ASSIGN_OR_RAISE(auto temp_output, builder.Finish());
+ *output = *temp_output->data();
+ // Builder type != logical type due to GenerateTypeAgnosticVarBinaryBase
+ output->type = batch[0].type();
+ return Status::OK();
+ }
+};
+
Result LastType(KernelContext*, const std::vector& descrs) {
ValueDescr result = descrs.back();
result.shape = GetBroadcastShape(descrs);
@@ -1399,6 +1630,25 @@ void AddPrimitiveCaseWhenKernels(const std::shared_ptr& scalar
}
}
+// Register one varargs 'coalesce' kernel on `scalar_function`: every argument must
+// have type id `get_id.id`, and the output type is resolved by FirstType. The
+// kernel computes its own validity bitmap into preallocated buffers, and is only
+// allowed to write into slices for fixed-width types.
+void AddCoalesceKernel(const std::shared_ptr& scalar_function,
+                       detail::GetTypeId get_id, ArrayKernelExec exec) {
+  auto signature = KernelSignature::Make({InputType(get_id.id)}, OutputType(FirstType),
+                                         /*is_varargs=*/true);
+  ScalarKernel kernel(std::move(signature), exec);
+  kernel.can_write_into_slices = is_fixed_width(get_id.id);
+  kernel.mem_allocation = MemAllocation::PREALLOCATE;
+  kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE;
+  DCHECK_OK(scalar_function->AddKernel(std::move(kernel)));
+}
+
+// Register a 'coalesce' kernel for each of the given primitive types, using the
+// exec function that GenerateTypeAgnosticPrimitive produces for that type.
+void AddPrimitiveCoalesceKernels(const std::shared_ptr& scalar_function,
+                                 const std::vector>& types) {
+  for (const auto& primitive_type : types) {
+    AddCoalesceKernel(scalar_function, primitive_type,
+                      GenerateTypeAgnosticPrimitive(*primitive_type));
+  }
+}
+
const FunctionDoc if_else_doc{"Choose values based on a condition",
("`cond` must be a Boolean scalar/ array. \n`left` or "
"`right` must be of the same type scalar/ array.\n"
@@ -1419,6 +1669,13 @@ const FunctionDoc case_when_doc{
"Essentially, this implements a switch-case or if-else, if-else... "
"statement."),
{"cond", "*cases"}};
+
+// Documentation attached to the "coalesce" function when it is registered:
+// summary, description, and the name of its single variadic argument.
+const FunctionDoc coalesce_doc{
+ "Select the first non-null value in each slot",
+ ("Each row of the output will be the value from the first corresponding input "
+ "for which the value is not null. If all inputs are null in a row, the output "
+ "will be null."),
+ {"*values"}};
} // namespace
void RegisterScalarIfElse(FunctionRegistry* registry) {
@@ -1447,6 +1704,22 @@ void RegisterScalarIfElse(FunctionRegistry* registry) {
AddCaseWhenKernel(func, Type::DECIMAL256, CaseWhenFunctor::Exec);
DCHECK_OK(registry->AddFunction(std::move(func)));
}
+ {
+ // "coalesce": variadic with at least one argument. Kernels are added for the
+ // numeric and temporal types, boolean, null, the two interval types listed
+ // below, fixed-size binary, both decimal widths, and the base binary types.
+ auto func = std::make_shared(
+ "coalesce", Arity::VarArgs(/*min_args=*/1), &coalesce_doc);
+ AddPrimitiveCoalesceKernels(func, NumericTypes());
+ AddPrimitiveCoalesceKernels(func, TemporalTypes());
+ AddPrimitiveCoalesceKernels(
+ func, {boolean(), null(), day_time_interval(), month_interval()});
+ // Parametric fixed-width types are matched by type id rather than by instance
+ AddCoalesceKernel(func, Type::FIXED_SIZE_BINARY,
+ CoalesceFunctor::Exec);
+ AddCoalesceKernel(func, Type::DECIMAL128, CoalesceFunctor::Exec);
+ AddCoalesceKernel(func, Type::DECIMAL256, CoalesceFunctor::Exec);
+ for (const auto& ty : BaseBinaryTypes()) {
+ AddCoalesceKernel(func, ty, GenerateTypeAgnosticVarBinaryBase(ty));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
}
} // namespace internal
diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
index 3b8df47162d..a63492987eb 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc
@@ -227,6 +227,61 @@ static void CaseWhenBench64Contiguous(benchmark::State& state) {
return CaseWhenBenchContiguous(state);
}
+// Benchmark 'coalesce' over four random arrays, each generated with a null
+// probability of 0.25 and then sliced at the requested offset.
+// state.range(0) is the generated array length, state.range(1) the slice offset.
+template
+static void CoalesceBench(benchmark::State& state) {
+  using CType = typename Type::c_type;
+  constexpr int kNumArguments = 4;
+
+  auto type = TypeTraits::type_singleton();
+  const int64_t len = state.range(0);
+  const int64_t offset = state.range(1);
+
+  random::RandomArrayGenerator rand(/*seed=*/0);
+
+  std::vector arguments;
+  for (int i = 0; i < kNumArguments; ++i) {
+    auto values = rand.ArrayOf(type, len, /*null_probability=*/0.25);
+    arguments.emplace_back(values->Slice(offset));
+  }
+
+  for (auto _ : state) {
+    ABORT_NOT_OK(CallFunction("coalesce", arguments));
+  }
+
+  // Bytes processed = iterations * number of arguments * values per argument * width
+  state.SetBytesProcessed(state.iterations() * arguments.size() * (len - offset) *
+                          sizeof(CType));
+}
+
+// Benchmark 'coalesce' over two random arrays: the first generated with a null
+// probability of 0.25, the second with a null probability of 0. Both are sliced
+// at the requested offset.
+// state.range(0) is the generated array length, state.range(1) the slice offset.
+template
+static void CoalesceNonNullBench(benchmark::State& state) {
+  using CType = typename Type::c_type;
+
+  auto type = TypeTraits::type_singleton();
+  const int64_t len = state.range(0);
+  const int64_t offset = state.range(1);
+
+  random::RandomArrayGenerator rand(/*seed=*/0);
+
+  std::vector arguments;
+  auto with_nulls = rand.ArrayOf(type, len, /*null_probability=*/0.25);
+  arguments.emplace_back(with_nulls->Slice(offset));
+  auto without_nulls = rand.ArrayOf(type, len, /*null_probability=*/0);
+  arguments.emplace_back(without_nulls->Slice(offset));
+
+  for (auto _ : state) {
+    ABORT_NOT_OK(CallFunction("coalesce", arguments));
+  }
+
+  // Bytes processed = iterations * number of arguments * values per argument * width
+  state.SetBytesProcessed(state.iterations() * arguments.size() * (len - offset) *
+                          sizeof(CType));
+}
+
+// Non-template entry point for CoalesceBench, registered with BENCHMARK below.
+static void CoalesceBench64(benchmark::State& state) {
+ return CoalesceBench(state);
+}
+
+// Non-template entry point for CoalesceNonNullBench, registered with BENCHMARK
+// below. It must forward to CoalesceNonNullBench (not CoalesceBench); otherwise
+// this benchmark just repeats CoalesceBench64 and the nullable-then-all-valid
+// case is never measured.
+static void CoalesceNonNullBench64(benchmark::State& state) {
+  return CoalesceNonNullBench(state);
+}
+
BENCHMARK(IfElseBench32)->Args({elems, 0});
BENCHMARK(IfElseBench64)->Args({elems, 0});
@@ -251,5 +306,11 @@ BENCHMARK(CaseWhenBench64)->Args({elems, 99});
BENCHMARK(CaseWhenBench64Contiguous)->Args({elems, 0});
BENCHMARK(CaseWhenBench64Contiguous)->Args({elems, 99});
+// Args are {array length, slice offset}; each benchmark runs once unsliced
+// (offset 0) and once sliced at offset 99.
+BENCHMARK(CoalesceBench64)->Args({elems, 0});
+BENCHMARK(CoalesceBench64)->Args({elems, 99});
+
+BENCHMARK(CoalesceNonNullBench64)->Args({elems, 0});
+BENCHMARK(CoalesceNonNullBench64)->Args({elems, 99});
+
} // namespace compute
} // namespace arrow
diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
index 8ff86f3ec29..48b0cdb457d 100644
--- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
@@ -853,5 +853,192 @@ TEST(TestCaseWhen, DispatchBest) {
ArrayFromJSON(int64(), "[]"),
ArrayFromJSON(utf8(), "[]")}));
}
+
+// Empty typed-test fixtures for 'coalesce': one suite instantiated over
+// NumericBasedTypes and one over BinaryTypes.
+template
+class TestCoalesceNumeric : public ::testing::Test {};
+template
+class TestCoalesceBinary : public ::testing::Test {};
+
+TYPED_TEST_SUITE(TestCoalesceNumeric, NumericBasedTypes);
+TYPED_TEST_SUITE(TestCoalesceBinary, BinaryTypes);
+
+// Numeric inputs: combines an all-null array, a partially-null array, all-valid
+// arrays and null/valid scalars in different argument positions, and checks that
+// each row takes the first non-null value.
+TYPED_TEST(TestCoalesceNumeric, FixedSize) {
+ auto type = default_type_instance();
+ auto scalar_null = ScalarFromJSON(type, "null");
+ auto scalar1 = ScalarFromJSON(type, "20");
+ auto values_null = ArrayFromJSON(type, "[null, null, null, null]");
+ auto values1 = ArrayFromJSON(type, "[null, 10, 11, 12]");
+ auto values2 = ArrayFromJSON(type, "[13, 14, 15, 16]");
+ auto values3 = ArrayFromJSON(type, "[17, 18, 19, null]");
+ // N.B. all-scalar cases are checked in CheckScalar
+ CheckScalar("coalesce", {values_null}, values_null);
+ CheckScalar("coalesce", {values_null, scalar1},
+ ArrayFromJSON(type, "[20, 20, 20, 20]"));
+ CheckScalar("coalesce", {values_null, values1}, values1);
+ CheckScalar("coalesce", {values_null, values2}, values2);
+ CheckScalar("coalesce", {values1, values_null}, values1);
+ CheckScalar("coalesce", {values2, values_null}, values2);
+ CheckScalar("coalesce", {scalar_null, values1}, values1);
+ CheckScalar("coalesce", {values1, scalar_null}, values1);
+ CheckScalar("coalesce", {values2, values1, values_null}, values2);
+ CheckScalar("coalesce", {values1, scalar1}, ArrayFromJSON(type, "[20, 10, 11, 12]"));
+ CheckScalar("coalesce", {values1, values2}, ArrayFromJSON(type, "[13, 10, 11, 12]"));
+ CheckScalar("coalesce", {values1, values2, values3},
+ ArrayFromJSON(type, "[13, 10, 11, 12]"));
+ CheckScalar("coalesce", {scalar1, values1}, ArrayFromJSON(type, "[20, 20, 20, 20]"));
+}
+
+// Binary/string inputs: same argument combinations as the numeric test, using
+// values of differing lengths.
+TYPED_TEST(TestCoalesceBinary, Basics) {
+ auto type = default_type_instance();
+ auto scalar_null = ScalarFromJSON(type, "null");
+ auto scalar1 = ScalarFromJSON(type, R"("a")");
+ auto values_null = ArrayFromJSON(type, R"([null, null, null, null])");
+ auto values1 = ArrayFromJSON(type, R"([null, "bc", "def", "ghij"])");
+ auto values2 = ArrayFromJSON(type, R"(["klmno", "p", "qr", "stu"])");
+ auto values3 = ArrayFromJSON(type, R"(["vwxy", "zabc", "d", null])");
+ // N.B. all-scalar cases are checked in CheckScalar
+ CheckScalar("coalesce", {values_null}, values_null);
+ CheckScalar("coalesce", {values_null, scalar1},
+ ArrayFromJSON(type, R"(["a", "a", "a", "a"])"));
+ CheckScalar("coalesce", {values_null, values1}, values1);
+ CheckScalar("coalesce", {values_null, values2}, values2);
+ CheckScalar("coalesce", {values1, values_null}, values1);
+ CheckScalar("coalesce", {values2, values_null}, values2);
+ CheckScalar("coalesce", {scalar_null, values1}, values1);
+ CheckScalar("coalesce", {values1, scalar_null}, values1);
+ CheckScalar("coalesce", {values2, values1, values_null}, values2);
+ CheckScalar("coalesce", {values1, scalar1},
+ ArrayFromJSON(type, R"(["a", "bc", "def", "ghij"])"));
+ CheckScalar("coalesce", {values1, values2},
+ ArrayFromJSON(type, R"(["klmno", "bc", "def", "ghij"])"));
+ CheckScalar("coalesce", {values1, values2, values3},
+ ArrayFromJSON(type, R"(["klmno", "bc", "def", "ghij"])"));
+ CheckScalar("coalesce", {scalar1, values1},
+ ArrayFromJSON(type, R"(["a", "a", "a", "a"])"));
+}
+
+// Null type: the result is the all-null array, with or without a trailing null
+// scalar argument.
+TEST(TestCoalesce, Null) {
+ auto type = null();
+ auto scalar_null = ScalarFromJSON(type, "null");
+ auto values_null = ArrayFromJSON(type, "[null, null, null, null]");
+ CheckScalar("coalesce", {values_null}, values_null);
+ CheckScalar("coalesce", {values_null, scalar_null}, values_null);
+}
+
+// Boolean inputs: same argument combinations as the numeric test.
+TEST(TestCoalesce, Boolean) {
+ auto type = boolean();
+ auto scalar_null = ScalarFromJSON(type, "null");
+ auto scalar1 = ScalarFromJSON(type, "false");
+ auto values_null = ArrayFromJSON(type, "[null, null, null, null]");
+ auto values1 = ArrayFromJSON(type, "[null, true, false, true]");
+ auto values2 = ArrayFromJSON(type, "[true, false, true, false]");
+ auto values3 = ArrayFromJSON(type, "[false, true, false, null]");
+ CheckScalar("coalesce", {values_null}, values_null);
+ CheckScalar("coalesce", {values_null, scalar1},
+ ArrayFromJSON(type, "[false, false, false, false]"));
+ CheckScalar("coalesce", {values_null, values1}, values1);
+ CheckScalar("coalesce", {values_null, values2}, values2);
+ CheckScalar("coalesce", {values1, values_null}, values1);
+ CheckScalar("coalesce", {values2, values_null}, values2);
+ CheckScalar("coalesce", {scalar_null, values1}, values1);
+ CheckScalar("coalesce", {values1, scalar_null}, values1);
+ CheckScalar("coalesce", {values2, values1, values_null}, values2);
+ CheckScalar("coalesce", {values1, scalar1},
+ ArrayFromJSON(type, "[false, true, false, true]"));
+ CheckScalar("coalesce", {values1, values2},
+ ArrayFromJSON(type, "[true, true, false, true]"));
+ CheckScalar("coalesce", {values1, values2, values3},
+ ArrayFromJSON(type, "[true, true, false, true]"));
+ CheckScalar("coalesce", {scalar1, values1},
+ ArrayFromJSON(type, "[false, false, false, false]"));
+}
+
+// Day-time interval inputs (values written as two-element JSON lists): same
+// argument combinations as the numeric test.
+TEST(TestCoalesce, DayTimeInterval) {
+ auto type = day_time_interval();
+ auto scalar_null = ScalarFromJSON(type, "null");
+ auto scalar1 = ScalarFromJSON(type, "[1, 2]");
+ auto values_null = ArrayFromJSON(type, "[null, null, null, null]");
+ auto values1 = ArrayFromJSON(type, "[null, [3, 4], [5, 6], [7, 8]]");
+ auto values2 = ArrayFromJSON(type, "[[9, 10], [11, 12], [13, 14], [15, 16]]");
+ auto values3 = ArrayFromJSON(type, "[[17, 18], [19, 20], [21, 22], null]");
+ // N.B. all-scalar cases are checked in CheckScalar
+ CheckScalar("coalesce", {values_null}, values_null);
+ CheckScalar("coalesce", {values_null, scalar1},
+ ArrayFromJSON(type, "[[1, 2], [1, 2], [1, 2], [1, 2]]"));
+ CheckScalar("coalesce", {values_null, values1}, values1);
+ CheckScalar("coalesce", {values_null, values2}, values2);
+ CheckScalar("coalesce", {values1, values_null}, values1);
+ CheckScalar("coalesce", {values2, values_null}, values2);
+ CheckScalar("coalesce", {scalar_null, values1}, values1);
+ CheckScalar("coalesce", {values1, scalar_null}, values1);
+ CheckScalar("coalesce", {values2, values1, values_null}, values2);
+ CheckScalar("coalesce", {values1, scalar1},
+ ArrayFromJSON(type, "[[1, 2], [3, 4], [5, 6], [7, 8]]"));
+ CheckScalar("coalesce", {values1, values2},
+ ArrayFromJSON(type, "[[9, 10], [3, 4], [5, 6], [7, 8]]"));
+ CheckScalar("coalesce", {values1, values2, values3},
+ ArrayFromJSON(type, "[[9, 10], [3, 4], [5, 6], [7, 8]]"));
+ CheckScalar("coalesce", {scalar1, values1},
+ ArrayFromJSON(type, "[[1, 2], [1, 2], [1, 2], [1, 2]]"));
+}
+
+// Decimal inputs: runs the same argument combinations for both decimal128(3, 2)
+// and decimal256(3, 2).
+TEST(TestCoalesce, Decimal) {
+ for (const auto& type :
+ std::vector>{decimal128(3, 2), decimal256(3, 2)}) {
+ auto scalar_null = ScalarFromJSON(type, "null");
+ auto scalar1 = ScalarFromJSON(type, R"("1.23")");
+ auto values_null = ArrayFromJSON(type, R"([null, null, null, null])");
+ auto values1 = ArrayFromJSON(type, R"([null, "4.56", "7.89", "1.34"])");
+ auto values2 = ArrayFromJSON(type, R"(["1.45", "2.34", "3.45", "4.56"])");
+ auto values3 = ArrayFromJSON(type, R"(["5.67", "6.78", "7.91", null])");
+ CheckScalar("coalesce", {values_null}, values_null);
+ CheckScalar("coalesce", {values_null, scalar1},
+ ArrayFromJSON(type, R"(["1.23", "1.23", "1.23", "1.23"])"));
+ CheckScalar("coalesce", {values_null, values1}, values1);
+ CheckScalar("coalesce", {values_null, values2}, values2);
+ CheckScalar("coalesce", {values1, values_null}, values1);
+ CheckScalar("coalesce", {values2, values_null}, values2);
+ CheckScalar("coalesce", {scalar_null, values1}, values1);
+ CheckScalar("coalesce", {values1, scalar_null}, values1);
+ CheckScalar("coalesce", {values2, values1, values_null}, values2);
+ CheckScalar("coalesce", {values1, scalar1},
+ ArrayFromJSON(type, R"(["1.23", "4.56", "7.89", "1.34"])"));
+ CheckScalar("coalesce", {values1, values2},
+ ArrayFromJSON(type, R"(["1.45", "4.56", "7.89", "1.34"])"));
+ CheckScalar("coalesce", {values1, values2, values3},
+ ArrayFromJSON(type, R"(["1.45", "4.56", "7.89", "1.34"])"));
+ CheckScalar("coalesce", {scalar1, values1},
+ ArrayFromJSON(type, R"(["1.23", "1.23", "1.23", "1.23"])"));
+ }
+}
+
+// Fixed-size binary inputs of width 3: same argument combinations as the
+// numeric test.
+TEST(TestCoalesce, FixedSizeBinary) {
+ auto type = fixed_size_binary(3);
+ auto scalar_null = ScalarFromJSON(type, "null");
+ auto scalar1 = ScalarFromJSON(type, R"("abc")");
+ auto values_null = ArrayFromJSON(type, R"([null, null, null, null])");
+ auto values1 = ArrayFromJSON(type, R"([null, "def", "ghi", "jkl"])");
+ auto values2 = ArrayFromJSON(type, R"(["mno", "pqr", "stu", "vwx"])");
+ auto values3 = ArrayFromJSON(type, R"(["yza", "bcd", "efg", null])");
+ CheckScalar("coalesce", {values_null}, values_null);
+ CheckScalar("coalesce", {values_null, scalar1},
+ ArrayFromJSON(type, R"(["abc", "abc", "abc", "abc"])"));
+ CheckScalar("coalesce", {values_null, values1}, values1);
+ CheckScalar("coalesce", {values_null, values2}, values2);
+ CheckScalar("coalesce", {values1, values_null}, values1);
+ CheckScalar("coalesce", {values2, values_null}, values2);
+ CheckScalar("coalesce", {scalar_null, values1}, values1);
+ CheckScalar("coalesce", {values1, scalar_null}, values1);
+ CheckScalar("coalesce", {values2, values1, values_null}, values2);
+ CheckScalar("coalesce", {values1, scalar1},
+ ArrayFromJSON(type, R"(["abc", "def", "ghi", "jkl"])"));
+ CheckScalar("coalesce", {values1, values2},
+ ArrayFromJSON(type, R"(["mno", "def", "ghi", "jkl"])"));
+ CheckScalar("coalesce", {values1, values2, values3},
+ ArrayFromJSON(type, R"(["mno", "def", "ghi", "jkl"])"));
+ CheckScalar("coalesce", {scalar1, values1},
+ ArrayFromJSON(type, R"(["abc", "abc", "abc", "abc"])"));
+}
+
} // namespace compute
} // namespace arrow
diff --git a/cpp/src/arrow/util/bit_block_counter.h b/cpp/src/arrow/util/bit_block_counter.h
index 803b825e1b2..63036af52a4 100644
--- a/cpp/src/arrow/util/bit_block_counter.h
+++ b/cpp/src/arrow/util/bit_block_counter.h
@@ -57,6 +57,16 @@ struct BitBlockAnd {
static bool Call(bool left, bool right) { return left && right; }
};
+// Combining operation "left AND NOT right": the result has a bit set where it is
+// set in `left` and clear in `right`.
+template
+struct BitBlockAndNot {
+ static T Call(T left, T right) { return left & ~right; }
+};
+
+// Specialization for single bool values, using logical rather than bitwise
+// operators.
+template <>
+struct BitBlockAndNot {
+ static bool Call(bool left, bool right) { return left && !right; }
+};
+
template
struct BitBlockOr {
static T Call(T left, T right) { return left | right; }
@@ -266,6 +276,9 @@ class ARROW_EXPORT BinaryBitBlockCounter {
/// blocks in subsequent invocations.
BitBlockCount NextAndWord() { return NextWord(); }
+ /// \brief Computes "x & ~y" block for each available run of bits.
+ ///
+ /// NOTE(review): presumably x is the first (left) bitmap passed to the
+ /// constructor and y the second (right) one, matching BitBlockAndNot - confirm.
+ BitBlockCount NextAndNotWord() { return NextWord(); }
+
/// \brief Computes "x | y" block for each available run of bits.
BitBlockCount NextOrWord() { return NextWord(); }
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 35011a786a6..9455a78367a 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -869,25 +869,27 @@ Structural transforms
+--------------------------+------------+---------------------------------------------------+---------------------+---------+
| Function name | Arity | Input types | Output type | Notes |
+==========================+============+===================================================+=====================+=========+
-| case_when | Varargs | Struct of Boolean (Arg 0), Any fixed-width (rest) | Input type | \(1) |
+| case_when | Varargs | Struct of Boolean (Arg 0), Any fixed-width (rest) | Input type | \(1) |
+--------------------------+------------+---------------------------------------------------+---------------------+---------+
-| fill_null | Binary | Boolean, Null, Numeric, Temporal, String-like | Input type | \(2) |
+| coalesce | Varargs | Any | Input type | \(2) |
+--------------------------+------------+---------------------------------------------------+---------------------+---------+
-| if_else | Ternary | Boolean, Null, Numeric, Temporal | Input type | \(3) |
+| fill_null | Binary | Boolean, Null, Numeric, Temporal, String-like | Input type | \(3) |
+--------------------------+------------+---------------------------------------------------+---------------------+---------+
-| is_finite | Unary | Float, Double | Boolean | \(4) |
+| if_else | Ternary | Boolean, Null, Numeric, Temporal | Input type | \(4) |
+--------------------------+------------+---------------------------------------------------+---------------------+---------+
-| is_inf | Unary | Float, Double | Boolean | \(5) |
+| is_finite | Unary | Float, Double | Boolean | \(5) |
+--------------------------+------------+---------------------------------------------------+---------------------+---------+
-| is_nan | Unary | Float, Double | Boolean | \(6) |
+| is_inf | Unary | Float, Double | Boolean | \(6) |
+--------------------------+------------+---------------------------------------------------+---------------------+---------+
-| is_null | Unary | Any | Boolean | \(7) |
+| is_nan | Unary | Float, Double | Boolean | \(7) |
+--------------------------+------------+---------------------------------------------------+---------------------+---------+
-| is_valid | Unary | Any | Boolean | \(8) |
+| is_null | Unary | Any | Boolean | \(8) |
+--------------------------+------------+---------------------------------------------------+---------------------+---------+
-| list_value_length | Unary | List-like | Int32 or Int64 | \(9) |
+| is_valid | Unary | Any | Boolean | \(9) |
+--------------------------+------------+---------------------------------------------------+---------------------+---------+
-| project | Varargs | Any | Struct | \(10) |
+| list_value_length | Unary | List-like | Int32 or Int64 | \(10) |
++--------------------------+------------+---------------------------------------------------+---------------------+---------+
+| make_struct | Varargs | Any | Struct | \(11) |
+--------------------------+------------+---------------------------------------------------+---------------------+---------+
* \(1) This function acts like a SQL 'case when' statement or switch-case. The
@@ -899,11 +901,14 @@ Structural transforms
the first value datum for which the corresponding Boolean is true, or the
corresponding value from the 'default' input, or null otherwise.
-* \(2) First input must be an array, second input a scalar of the same type.
+* \(2) Each row of the output will be the corresponding value of the first
+ input which is non-null for that row, otherwise null.
+
+* \(3) First input must be an array, second input a scalar of the same type.
Output is an array of the same type as the inputs, and with the same values
as the first input, except for nulls replaced with the second input value.
-* \(3) First input must be a Boolean scalar or array. Second and third inputs
+* \(4) First input must be a Boolean scalar or array. Second and third inputs
could be scalars or arrays and must be of the same type. Output is an array
(or scalar if all inputs are scalar) of the same type as the second/ third
input. If the nulls present on the first input, they will be promoted to the
@@ -911,21 +916,21 @@ Structural transforms
Also see: :ref:`replace_with_mask `.
-* \(4) Output is true iff the corresponding input element is finite (not Infinity,
+* \(5) Output is true iff the corresponding input element is finite (not Infinity,
-Infinity, or NaN).
-* \(5) Output is true iff the corresponding input element is Infinity/-Infinity.
+* \(6) Output is true iff the corresponding input element is Infinity/-Infinity.
-* \(6) Output is true iff the corresponding input element is NaN.
+* \(7) Output is true iff the corresponding input element is NaN.
-* \(7) Output is true iff the corresponding input element is null.
+* \(8) Output is true iff the corresponding input element is null.
-* \(8) Output is true iff the corresponding input element is non-null.
+* \(9) Output is true iff the corresponding input element is non-null.
-* \(9) Each output element is the length of the corresponding input element
+* \(10) Each output element is the length of the corresponding input element
(null if input is null). Output type is Int32 for List, Int64 for LargeList.
-* \(10) The output struct's field types are the types of its arguments. The
+* \(11) The output struct's field types are the types of its arguments. The
field names are specified using an instance of :struct:`MakeStructOptions`.
The output shape will be scalar if all inputs are scalar, otherwise any
scalars will be broadcast to arrays.
diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst
index fd32d08f85c..85a6200436a 100644
--- a/docs/source/python/api/compute.rst
+++ b/docs/source/python/api/compute.rst
@@ -337,6 +337,7 @@ Structural Transforms
binary_length
case_when
+ coalesce
fill_null
if_else
is_finite