From 8a5545fad496395bc251203344486a36eff52a5e Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 21 Jul 2021 09:42:04 -0400 Subject: [PATCH 01/31] ARROW-13222: [C++] Improve type support for case_when kernel --- .../arrow/compute/kernels/scalar_if_else.cc | 163 +++++++++++++++++- .../compute/kernels/scalar_if_else_test.cc | 82 +++++++++ 2 files changed, 242 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index cb261ec59a7..e130d2b3ad9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1413,6 +1413,148 @@ struct CaseWhenFunctor { } }; +template +struct CaseWhenFunctor> { + using offset_type = typename Type::offset_type; + using BuilderType = typename TypeTraits::BuilderType; + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + if (batch[0].null_count() > 0) { + return Status::Invalid("cond struct must not have outer nulls"); + } + if (batch[0].is_scalar()) { + return ExecScalar(ctx, batch, out); + } + return ExecArray(ctx, batch, out); + } + + static Status ExecScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const auto& conds = checked_cast(*batch.values[0].scalar()); + Datum result; + for (size_t i = 0; i < batch.values.size() - 1; i++) { + if (i < conds.value.size()) { + const Scalar& cond = *conds.value[i]; + if (cond.is_valid && internal::UnboxScalar::Unbox(cond)) { + result = batch[i + 1]; + break; + } + } else { + // ELSE clause + result = batch[i + 1]; + break; + } + } + if (out->is_scalar()) { + *out = result.is_scalar() ? result.scalar() : MakeNullScalar(out->type()); + return Status::OK(); + } + ArrayData* output = out->mutable_array(); + if (!result.is_value()) { + // All conditions false, no 'else' argument + ARROW_ASSIGN_OR_RAISE( + auto array, MakeArrayOfNull(output->type, batch.length, ctx->memory_pool())); + *output = *array->data(); + } else if (result.is_scalar()) { + ARROW_ASSIGN_OR_RAISE( + auto array, + MakeArrayFromScalar(*result.scalar(), batch.length, ctx->memory_pool())); + *output = *array->data(); + } else { + // Copy offsets/data + const ArrayData& source = *result.array(); + output->length = source.length; + output->SetNullCount(source.null_count); + if (source.MayHaveNulls()) { + ARROW_ASSIGN_OR_RAISE( + output->buffers[0], + arrow::internal::CopyBitmap(ctx->memory_pool(), source.buffers[0]->data(), + source.offset, source.length)); + } + ARROW_ASSIGN_OR_RAISE(output->buffers[1], + ctx->Allocate(sizeof(offset_type) * (source.length + 1))); + const offset_type* in_offsets = source.GetValues(1); + offset_type* out_offsets = output->GetMutableValues(1); + std::transform(in_offsets, in_offsets + source.length + 1, out_offsets, + [&](offset_type offset) { return offset - in_offsets[0]; }); + auto data_length = out_offsets[output->length] - out_offsets[0]; + ARROW_ASSIGN_OR_RAISE(output->buffers[2], ctx->Allocate(data_length)); + std::memcpy(output->buffers[2]->mutable_data(), + source.buffers[2]->data() + in_offsets[0], data_length); + } + return Status::OK(); + } + + static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const auto& conds_array = *batch.values[0].array(); + if (conds_array.GetNullCount() > 0) { + return Status::Invalid("cond struct must not have top-level nulls"); + } + ArrayData* output = out->mutable_array(); + const bool have_else_arg = + static_cast(conds_array.type->num_fields()) < (batch.values.size() - 1); + BuilderType builder(batch[0].type(), ctx->memory_pool()); + RETURN_NOT_OK(builder.Reserve(batch.length)); + int64_t reservation = 0; + for (size_t arg = 1; arg < batch.values.size(); arg++) { + auto source = batch.values[arg]; + if (source.is_scalar()) { + const auto& scalar = checked_cast(*source.scalar()); + if (!scalar.value) continue; + reservation = std::max(reservation, batch.length * scalar.value->size()); + } else { + const auto& array = *source.array(); + const auto& offsets = array.GetValues(1); + reservation = std::max(reservation, offsets[array.length] - offsets[0]); + } + } + RETURN_NOT_OK(builder.ReserveData(reservation)); + + for (int64_t row = 0; row < batch.length; row++) { + int64_t selected = have_else_arg ? batch.values.size() - 1 : -1; + for (int64_t arg = 0; static_cast(arg) < conds_array.child_data.size(); + arg++) { + const ArrayData& cond_array = *conds_array.child_data[arg]; + if ((!cond_array.buffers[0] || + BitUtil::GetBit(cond_array.buffers[0]->data(), + conds_array.offset + cond_array.offset + row)) && + BitUtil::GetBit(cond_array.buffers[1]->data(), + conds_array.offset + cond_array.offset + row)) { + selected = arg + 1; + break; + } + } + if (selected < 0) { + RETURN_NOT_OK(builder.AppendNull()); + continue; + } + const Datum& source = batch.values[selected]; + if (source.is_scalar()) { + const auto& scalar = checked_cast(*source.scalar()); + if (!scalar.is_valid) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + RETURN_NOT_OK(builder.Append(scalar.value->data(), scalar.value->size())); + } + } else { + const auto& array = *source.array(); + if (!array.buffers[0] || + BitUtil::GetBit(array.buffers[0]->data(), array.offset + row)) { + const offset_type* offsets = array.GetValues(1); + RETURN_NOT_OK(builder.Append(array.buffers[2]->data() + offsets[row], + offsets[row + 1] - offsets[row])); + } else { + RETURN_NOT_OK(builder.AppendNull()); + } + } + } + + ARROW_ASSIGN_OR_RAISE(auto temp_output, builder.Finish()); + *output = *temp_output->data(); + // Builder type != logical type due to GenerateTypeAgnosticVarBinaryBase + output->type = batch[1].type(); + return Status::OK(); + } +}; + struct CoalesceFunction : ScalarFunction { using ScalarFunction::ScalarFunction; @@ -1841,9 +1983,15 @@ void AddCaseWhenKernel(const std::shared_ptr& scalar_function, OutputType(LastType), /*is_varargs=*/true), exec); - kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE; - kernel.mem_allocation = MemAllocation::PREALLOCATE; - kernel.can_write_into_slices = is_fixed_width(get_id.id); + if (is_fixed_width(get_id.id)) { + kernel.null_handling = NullHandling::COMPUTED_PREALLOCATE; + kernel.mem_allocation = MemAllocation::PREALLOCATE; + kernel.can_write_into_slices = true; + } else { + kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; + kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; + kernel.can_write_into_slices = false; + } DCHECK_OK(scalar_function->AddKernel(std::move(kernel))); } @@ -1855,6 +2003,14 @@ void AddPrimitiveCaseWhenKernels(const std::shared_ptr& scalar } } +void AddBinaryCaseWhenKernels(const std::shared_ptr& scalar_function, + const std::vector>& types) { + for (auto&& type : types) { + auto exec = GenerateTypeAgnosticVarBinaryBase(*type); + AddCaseWhenKernel(scalar_function, type, std::move(exec)); + } +} + void AddCoalesceKernel(const std::shared_ptr& scalar_function, detail::GetTypeId get_id, ArrayKernelExec exec) { ScalarKernel kernel(KernelSignature::Make({InputType(get_id.id)}, OutputType(FirstType), @@ -1957,6 +2113,7 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { CaseWhenFunctor::Exec); AddCaseWhenKernel(func, Type::DECIMAL128, CaseWhenFunctor::Exec); AddCaseWhenKernel(func, Type::DECIMAL256, CaseWhenFunctor::Exec); + AddBinaryCaseWhenKernels(func, BaseBinaryTypes()); DCHECK_OK(registry->AddFunction(std::move(func))); } { diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index 8a6ccd69865..90f73107338 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -531,6 +531,10 @@ TYPED_TEST(TestCaseWhenNumeric, FixedSize) { CheckScalar("case_when", {MakeStruct({}), values1}, values1); CheckScalar("case_when", {MakeStruct({}), values_null}, values_null); + CheckScalar("case_when", {MakeStruct({cond_true}), scalar1, values1}, + *MakeArrayFromScalar(*scalar1, 4)); + CheckScalar("case_when", {MakeStruct({cond_false}), scalar1, values1}, values1); + CheckScalar("case_when", {MakeStruct({cond_true}), values1}, values1); CheckScalar("case_when", {MakeStruct({cond_false}), values1}, values_null); CheckScalar("case_when", {MakeStruct({cond_null}), values1}, values_null); @@ -632,6 +636,10 @@ TEST(TestCaseWhen, Boolean) { CheckScalar("case_when", {MakeStruct({}), values1}, values1); CheckScalar("case_when", {MakeStruct({}), values_null}, values_null); + CheckScalar("case_when", {MakeStruct({cond_true}), scalar1, values1}, + *MakeArrayFromScalar(*scalar1, 4)); + CheckScalar("case_when", {MakeStruct({cond_false}), scalar1, values1}, values1); + CheckScalar("case_when", {MakeStruct({cond_true}), values1}, values1); CheckScalar("case_when", {MakeStruct({cond_false}), values1}, values_null); CheckScalar("case_when", {MakeStruct({cond_null}), values1}, values_null); @@ -685,6 +693,10 @@ TEST(TestCaseWhen, DayTimeInterval) { CheckScalar("case_when", {MakeStruct({}), values1}, values1); CheckScalar("case_when", {MakeStruct({}), values_null}, values_null); + CheckScalar("case_when", {MakeStruct({cond_true}), scalar1, values1}, + *MakeArrayFromScalar(*scalar1, 4)); + CheckScalar("case_when", {MakeStruct({cond_false}), scalar1, values1}, values1); + CheckScalar("case_when", {MakeStruct({cond_true}), values1}, values1); CheckScalar("case_when", {MakeStruct({cond_false}), values1}, values_null); CheckScalar("case_when", {MakeStruct({cond_null}), values1}, values_null); @@ -739,6 +751,10 @@ TEST(TestCaseWhen, Decimal) { CheckScalar("case_when", {MakeStruct({}), values1}, values1); CheckScalar("case_when", {MakeStruct({}), values_null}, values_null); + CheckScalar("case_when", {MakeStruct({cond_true}), scalar1, values1}, + *MakeArrayFromScalar(*scalar1, 4)); + CheckScalar("case_when", {MakeStruct({cond_false}), scalar1, values1}, values1); + CheckScalar("case_when", {MakeStruct({cond_true}), values1}, values1); CheckScalar("case_when", {MakeStruct({cond_false}), values1}, values_null); CheckScalar("case_when", {MakeStruct({cond_null}), values1}, values_null); @@ -794,6 +810,10 @@ TEST(TestCaseWhen, FixedSizeBinary) { CheckScalar("case_when", {MakeStruct({}), values1}, values1); CheckScalar("case_when", {MakeStruct({}), values_null}, values_null); + CheckScalar("case_when", {MakeStruct({cond_true}), scalar1, values1}, + *MakeArrayFromScalar(*scalar1, 4)); + CheckScalar("case_when", {MakeStruct({cond_false}), scalar1, values1}, values1); + CheckScalar("case_when", {MakeStruct({cond_true}), values1}, values1); CheckScalar("case_when", {MakeStruct({cond_false}), values1}, values_null); CheckScalar("case_when", {MakeStruct({cond_null}), values1}, values_null); @@ -830,6 +850,68 @@ TEST(TestCaseWhen, FixedSizeBinary) { ArrayFromJSON(type, R"([null, null, null, "efg"])")); } +template +class TestCaseWhenBinary : public ::testing::Test {}; + +TYPED_TEST_SUITE(TestCaseWhenBinary, BinaryTypes); + +TYPED_TEST(TestCaseWhenBinary, Basics) { + auto type = default_type_instance(); + auto cond_true = ScalarFromJSON(boolean(), "true"); + auto cond_false = ScalarFromJSON(boolean(), "false"); + auto cond_null = ScalarFromJSON(boolean(), "null"); + auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); + auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); + auto scalar_null = ScalarFromJSON(type, "null"); + auto scalar1 = ScalarFromJSON(type, R"("aBxYz")"); + auto scalar2 = ScalarFromJSON(type, R"("b")"); + auto values_null = ArrayFromJSON(type, "[null, null, null, null]"); + auto values1 = ArrayFromJSON(type, R"(["cDE", null, "degfhi", "efg"])"); + auto values2 = ArrayFromJSON(type, R"(["fghijk", "ghi", null, "hi"])"); + + CheckScalar("case_when", {MakeStruct({}), values1}, values1); + CheckScalar("case_when", {MakeStruct({}), values_null}, values_null); + + CheckScalar("case_when", {MakeStruct({cond_true}), scalar1, values1}, + *MakeArrayFromScalar(*scalar1, 4)); + CheckScalar("case_when", {MakeStruct({cond_false}), scalar1, values1}, values1); + + CheckScalar("case_when", {MakeStruct({cond_true}), values1}, values1); + CheckScalar("case_when", {MakeStruct({cond_false}), values1}, values_null); + CheckScalar("case_when", {MakeStruct({cond_null}), values1}, values_null); + CheckScalar("case_when", {MakeStruct({cond_true}), values1, values2}, values1); + CheckScalar("case_when", {MakeStruct({cond_false}), values1, values2}, values2); + CheckScalar("case_when", {MakeStruct({cond_null}), values1, values2}, values2); + + CheckScalar("case_when", {MakeStruct({cond_true, cond_true}), values1, values2}, + values1); + CheckScalar("case_when", {MakeStruct({cond_false, cond_false}), values1, values2}, + values_null); + CheckScalar("case_when", {MakeStruct({cond_true, cond_false}), values1, values2}, + values1); + CheckScalar("case_when", {MakeStruct({cond_false, cond_true}), values1, values2}, + values2); + CheckScalar("case_when", {MakeStruct({cond_null, cond_true}), values1, values2}, + values2); + CheckScalar("case_when", + {MakeStruct({cond_false, cond_false}), values1, values2, values2}, values2); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2}, + ArrayFromJSON(type, R"(["aBxYz", "aBxYz", "b", null])")); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null}, values_null); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null, scalar1}, + ArrayFromJSON(type, R"([null, null, "aBxYz", "aBxYz"])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2, scalar1}, + ArrayFromJSON(type, R"(["aBxYz", "aBxYz", "b", "aBxYz"])")); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + ArrayFromJSON(type, R"(["cDE", null, null, null])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, + ArrayFromJSON(type, R"(["cDE", null, null, "efg"])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + ArrayFromJSON(type, R"([null, null, null, "efg"])")); +} + TEST(TestCaseWhen, DispatchBest) { CheckDispatchBest("case_when", {struct_({field("", boolean())}), int64(), int32()}, {struct_({field("", boolean())}), int64(), int64()}); From 2202b74434e7f1892c7b269363a0fe4cd0d321d8 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 21 Jul 2021 14:56:58 -0400 Subject: [PATCH 02/31] ARROW-13222: [C++] Add list type support for case_when kernel --- .../arrow/compute/kernels/scalar_if_else.cc | 206 +++++++++++++----- .../compute/kernels/scalar_if_else_test.cc | 67 +++++- cpp/src/arrow/testing/gtest_util.h | 2 + 3 files changed, 217 insertions(+), 58 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index e130d2b3ad9..d946c5ed78b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +#include #include #include #include @@ -1413,6 +1414,46 @@ struct CaseWhenFunctor { } }; +template +static Status ExecVarWidthScalarCaseWhen(KernelContext* ctx, const ExecBatch& batch, + Datum* out, CopyArray copy_array) { + const auto& conds = checked_cast(*batch.values[0].scalar()); + Datum result; + for (size_t i = 0; i < batch.values.size() - 1; i++) { + if (i < conds.value.size()) { + const Scalar& cond = *conds.value[i]; + if (cond.is_valid && internal::UnboxScalar::Unbox(cond)) { + result = batch[i + 1]; + break; + } + } else { + // ELSE clause + result = batch[i + 1]; + break; + } + } + if (out->is_scalar()) { + *out = result.is_scalar() ? result.scalar() : MakeNullScalar(out->type()); + return Status::OK(); + } + ArrayData* output = out->mutable_array(); + if (!result.is_value()) { + // All conditions false, no 'else' argument + ARROW_ASSIGN_OR_RAISE( + auto array, MakeArrayOfNull(output->type, batch.length, ctx->memory_pool())); + *output = *array->data(); + } else if (result.is_scalar()) { + ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(*result.scalar(), batch.length, + ctx->memory_pool())); + *output = *array->data(); + } else { + // Copy offsets/data + const ArrayData& source = *result.array(); + RETURN_NOT_OK(copy_array(ctx, source, output)); + } + return Status::OK(); +} + template struct CaseWhenFunctor> { using offset_type = typename Type::offset_type; @@ -1428,70 +1469,37 @@ struct CaseWhenFunctor> { } static Status ExecScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const auto& conds = checked_cast(*batch.values[0].scalar()); - Datum result; - for (size_t i = 0; i < batch.values.size() - 1; i++) { - if (i < conds.value.size()) { - const Scalar& cond = *conds.value[i]; - if (cond.is_valid && internal::UnboxScalar::Unbox(cond)) { - result = batch[i + 1]; - break; - } - } else { - // ELSE clause - result = batch[i + 1]; - break; - } - } - if (out->is_scalar()) { - *out = result.is_scalar() ? result.scalar() : MakeNullScalar(out->type()); - return Status::OK(); - } - ArrayData* output = out->mutable_array(); - if (!result.is_value()) { - // All conditions false, no 'else' argument - ARROW_ASSIGN_OR_RAISE( - auto array, MakeArrayOfNull(output->type, batch.length, ctx->memory_pool())); - *output = *array->data(); - } else if (result.is_scalar()) { - ARROW_ASSIGN_OR_RAISE( - auto array, - MakeArrayFromScalar(*result.scalar(), batch.length, ctx->memory_pool())); - *output = *array->data(); - } else { - // Copy offsets/data - const ArrayData& source = *result.array(); - output->length = source.length; - output->SetNullCount(source.null_count); - if (source.MayHaveNulls()) { - ARROW_ASSIGN_OR_RAISE( - output->buffers[0], - arrow::internal::CopyBitmap(ctx->memory_pool(), source.buffers[0]->data(), - source.offset, source.length)); - } - ARROW_ASSIGN_OR_RAISE(output->buffers[1], - ctx->Allocate(sizeof(offset_type) * (source.length + 1))); - const offset_type* in_offsets = source.GetValues(1); - offset_type* out_offsets = output->GetMutableValues(1); - std::transform(in_offsets, in_offsets + source.length + 1, out_offsets, - [&](offset_type offset) { return offset - in_offsets[0]; }); - auto data_length = out_offsets[output->length] - out_offsets[0]; - ARROW_ASSIGN_OR_RAISE(output->buffers[2], ctx->Allocate(data_length)); - std::memcpy(output->buffers[2]->mutable_data(), - source.buffers[2]->data() + in_offsets[0], data_length); - } - return Status::OK(); + return ExecVarWidthScalarCaseWhen( + ctx, batch, out, + [](KernelContext* ctx, const ArrayData& source, ArrayData* output) { + output->length = source.length; + output->SetNullCount(source.null_count); + if (source.MayHaveNulls()) { + ARROW_ASSIGN_OR_RAISE( + output->buffers[0], + arrow::internal::CopyBitmap(ctx->memory_pool(), source.buffers[0]->data(), + source.offset, source.length)); + } + ARROW_ASSIGN_OR_RAISE(output->buffers[1], + ctx->Allocate(sizeof(offset_type) * (source.length + 1))); + const offset_type* in_offsets = source.GetValues(1); + offset_type* out_offsets = output->GetMutableValues(1); + std::transform(in_offsets, in_offsets + source.length + 1, out_offsets, + [&](offset_type offset) { return offset - in_offsets[0]; }); + auto data_length = out_offsets[output->length] - out_offsets[0]; + ARROW_ASSIGN_OR_RAISE(output->buffers[2], ctx->Allocate(data_length)); + std::memcpy(output->buffers[2]->mutable_data(), + source.buffers[2]->data() + in_offsets[0], data_length); + return Status::OK(); + }); } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& conds_array = *batch.values[0].array(); - if (conds_array.GetNullCount() > 0) { - return Status::Invalid("cond struct must not have top-level nulls"); - } ArrayData* output = out->mutable_array(); const bool have_else_arg = static_cast(conds_array.type->num_fields()) < (batch.values.size() - 1); - BuilderType builder(batch[0].type(), ctx->memory_pool()); + BuilderType builder(out->type(), ctx->memory_pool()); RETURN_NOT_OK(builder.Reserve(batch.length)); int64_t reservation = 0; for (size_t arg = 1; arg < batch.values.size(); arg++) { @@ -1555,6 +1563,88 @@ struct CaseWhenFunctor> { } }; +template +struct CaseWhenFunctor> { + using offset_type = typename Type::offset_type; + using BuilderType = typename TypeTraits::BuilderType; + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + if (batch[0].null_count() > 0) { + return Status::Invalid("cond struct must not have outer nulls"); + } + if (batch[0].is_scalar()) { + return ExecScalar(ctx, batch, out); + } + return ExecArray(ctx, batch, out); + } + + static Status ExecScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + return ExecVarWidthScalarCaseWhen( + ctx, batch, out, + [](KernelContext* ctx, const ArrayData& source, ArrayData* output) { + *output = source; + return Status::OK(); + }); + } + static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const auto& conds_array = *batch.values[0].array(); + ArrayData* output = out->mutable_array(); + const bool have_else_arg = + static_cast(conds_array.type->num_fields()) < (batch.values.size() - 1); + std::unique_ptr raw_builder; + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), out->type(), &raw_builder)); + BuilderType* builder = checked_cast(raw_builder.get()); + RETURN_NOT_OK(builder->Reserve(batch.length)); + + for (int64_t row = 0; row < batch.length; row++) { + int64_t selected = have_else_arg ? batch.values.size() - 1 : -1; + for (int64_t arg = 0; static_cast(arg) < conds_array.child_data.size(); + arg++) { + const ArrayData& cond_array = *conds_array.child_data[arg]; + if ((!cond_array.buffers[0] || + BitUtil::GetBit(cond_array.buffers[0]->data(), + conds_array.offset + cond_array.offset + row)) && + BitUtil::GetBit(cond_array.buffers[1]->data(), + conds_array.offset + cond_array.offset + row)) { + selected = arg + 1; + break; + } + } + if (selected < 0) { + RETURN_NOT_OK(builder->AppendNull()); + continue; + } + const Datum& source = batch.values[selected]; + // This is horrendously slow, but generic + if (source.is_scalar()) { + const auto& scalar = *source.scalar(); + if (!scalar.is_valid) { + RETURN_NOT_OK(builder->AppendNull()); + } else { + RETURN_NOT_OK(builder->AppendScalar(scalar)); + } + } else { + const auto& array = *source.array(); + if (!array.buffers[0] || + BitUtil::GetBit(array.buffers[0]->data(), array.offset + row)) { + const auto boxed_array = source.make_array(); + if (boxed_array->IsValid(row)) { + ARROW_ASSIGN_OR_RAISE(auto element, boxed_array->GetScalar(row)); + RETURN_NOT_OK(builder->AppendScalar(*element)); + } else { + RETURN_NOT_OK(builder->AppendNull()); + } + } else { + RETURN_NOT_OK(builder->AppendNull()); + } + } + } + + ARROW_ASSIGN_OR_RAISE(auto temp_output, builder->Finish()); + *output = *temp_output->data(); + return Status::OK(); + } +}; + struct CoalesceFunction : ScalarFunction { using ScalarFunction::ScalarFunction; @@ -2114,6 +2204,8 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { AddCaseWhenKernel(func, Type::DECIMAL128, CaseWhenFunctor::Exec); AddCaseWhenKernel(func, Type::DECIMAL256, CaseWhenFunctor::Exec); AddBinaryCaseWhenKernels(func, BaseBinaryTypes()); + AddCaseWhenKernel(func, Type::LIST, CaseWhenFunctor::Exec); + AddCaseWhenKernel(func, Type::LARGE_LIST, CaseWhenFunctor::Exec); DCHECK_OK(registry->AddFunction(std::move(func))); } { diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index 90f73107338..39ff7a90d08 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -853,7 +853,7 @@ TEST(TestCaseWhen, FixedSizeBinary) { template class TestCaseWhenBinary : public ::testing::Test {}; -TYPED_TEST_SUITE(TestCaseWhenBinary, BinaryTypes); +TYPED_TEST_SUITE(TestCaseWhenBinary, BinaryArrowTypes); TYPED_TEST(TestCaseWhenBinary, Basics) { auto type = default_type_instance(); @@ -912,6 +912,71 @@ TYPED_TEST(TestCaseWhenBinary, Basics) { ArrayFromJSON(type, R"([null, null, null, "efg"])")); } +template +class TestCaseWhenList : public ::testing::Test {}; + +TYPED_TEST_SUITE(TestCaseWhenList, ListArrowTypes); + +TYPED_TEST(TestCaseWhenList, ListOfString) { + auto type = std::make_shared(utf8()); + auto cond_true = ScalarFromJSON(boolean(), "true"); + auto cond_false = ScalarFromJSON(boolean(), "false"); + auto cond_null = ScalarFromJSON(boolean(), "null"); + auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); + auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); + auto scalar_null = ScalarFromJSON(type, "null"); + auto scalar1 = ScalarFromJSON(type, R"(["aB", "xYz"])"); + auto scalar2 = ScalarFromJSON(type, R"(["b", null])"); + auto values_null = ArrayFromJSON(type, "[null, null, null, null]"); + auto values1 = + ArrayFromJSON(type, R"([["cD", "E"], null, ["de", "gf", "hi"], ["ef", "g"]])"); + auto values2 = ArrayFromJSON(type, R"([["f", "ghi", "jk"], ["ghi"], null, ["hi"]])"); + + CheckScalar("case_when", {MakeStruct({}), values1}, values1); + CheckScalar("case_when", {MakeStruct({}), values_null}, values_null); + + CheckScalar("case_when", {MakeStruct({cond_true}), scalar1, values1}, + *MakeArrayFromScalar(*scalar1, 4)); + CheckScalar("case_when", {MakeStruct({cond_false}), scalar1, values1}, values1); + + CheckScalar("case_when", {MakeStruct({cond_true}), values1}, values1); + CheckScalar("case_when", {MakeStruct({cond_false}), values1}, values_null); + CheckScalar("case_when", {MakeStruct({cond_null}), values1}, values_null); + CheckScalar("case_when", {MakeStruct({cond_true}), values1, values2}, values1); + CheckScalar("case_when", {MakeStruct({cond_false}), values1, values2}, values2); + CheckScalar("case_when", {MakeStruct({cond_null}), values1, values2}, values2); + + CheckScalar("case_when", {MakeStruct({cond_true, cond_true}), values1, values2}, + values1); + CheckScalar("case_when", {MakeStruct({cond_false, cond_false}), values1, values2}, + values_null); + CheckScalar("case_when", {MakeStruct({cond_true, cond_false}), values1, values2}, + values1); + CheckScalar("case_when", {MakeStruct({cond_false, cond_true}), values1, values2}, + values2); + CheckScalar("case_when", {MakeStruct({cond_null, cond_true}), values1, values2}, + values2); + CheckScalar("case_when", + {MakeStruct({cond_false, cond_false}), values1, values2, values2}, values2); + + CheckScalar( + "case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2}, + ArrayFromJSON(type, R"([["aB", "xYz"], ["aB", "xYz"], ["b", null], null])")); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null}, values_null); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null, scalar1}, + ArrayFromJSON(type, R"([null, null, ["aB", "xYz"], ["aB", "xYz"]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2, scalar1}, + ArrayFromJSON( + type, R"([["aB", "xYz"], ["aB", "xYz"], ["b", null], ["aB", "xYz"]])")); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + ArrayFromJSON(type, R"([["cD", "E"], null, null, null])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, + ArrayFromJSON(type, R"([["cD", "E"], null, null, ["ef", "g"]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + ArrayFromJSON(type, R"([null, null, null, ["ef", "g"]])")); +} + TEST(TestCaseWhen, DispatchBest) { CheckDispatchBest("case_when", {struct_({field("", boolean())}), int64(), int32()}, {struct_({field("", boolean())}), int64(), int64()}); diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 3f9408ecdcb..a58064e4261 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -170,6 +170,8 @@ using BinaryArrowTypes = using StringArrowTypes = ::testing::Types; +using ListArrowTypes = ::testing::Types; + using UnionArrowTypes = ::testing::Types; class Array; From 737347a707999af31b2e4f76c5037fbc680e067b Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 22 Jul 2021 12:47:46 -0400 Subject: [PATCH 03/31] ARROW-13222: [C++] Add more benchmarks for case_when kernel --- .../arrow/compute/kernels/scalar_if_else.cc | 235 +++++++++--------- .../kernels/scalar_if_else_benchmark.cc | 103 ++++++-- 2 files changed, 196 insertions(+), 142 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index d946c5ed78b..70ad1676de4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1454,6 +1454,62 @@ static Status ExecVarWidthScalarCaseWhen(KernelContext* ctx, const ExecBatch& ba return Status::OK(); } +template +static Status ExecVarWidthArrayCaseWhen(KernelContext* ctx, const ExecBatch& batch, + Datum* out, ReserveData reserve_data, + AppendScalar append_scalar, + AppendArray append_array) { + const auto& conds_array = *batch.values[0].array(); + ArrayData* output = out->mutable_array(); + const bool have_else_arg = + static_cast(conds_array.type->num_fields()) < (batch.values.size() - 1); + std::unique_ptr raw_builder; + RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), out->type(), &raw_builder)); + RETURN_NOT_OK(raw_builder->Reserve(batch.length)); + reserve_data(raw_builder.get()); + + for (int64_t row = 0; row < batch.length; row++) { + int64_t selected = have_else_arg ? batch.values.size() - 1 : -1; + for (int64_t arg = 0; static_cast(arg) < conds_array.child_data.size(); + arg++) { + const ArrayData& cond_array = *conds_array.child_data[arg]; + if ((!cond_array.buffers[0] || + BitUtil::GetBit(cond_array.buffers[0]->data(), + conds_array.offset + cond_array.offset + row)) && + BitUtil::GetBit(cond_array.buffers[1]->data(), + conds_array.offset + cond_array.offset + row)) { + selected = arg + 1; + break; + } + } + if (selected < 0) { + RETURN_NOT_OK(raw_builder->AppendNull()); + continue; + } + const Datum& source = batch.values[selected]; + if (source.is_scalar()) { + const auto& scalar = *source.scalar(); + if (!scalar.is_valid) { + RETURN_NOT_OK(raw_builder->AppendNull()); + } else { + RETURN_NOT_OK(append_scalar(raw_builder.get(), scalar)); + } + } else { + const auto& array = source.array(); + if (!array->buffers[0] || + BitUtil::GetBit(array->buffers[0]->data(), array->offset + row)) { + RETURN_NOT_OK(append_array(raw_builder.get(), array, row)); + } else { + RETURN_NOT_OK(raw_builder->AppendNull()); + } + } + } + + ARROW_ASSIGN_OR_RAISE(auto temp_output, raw_builder->Finish()); + *output = *temp_output->data(); + return Status::OK(); +} + template struct CaseWhenFunctor> { using offset_type = typename Type::offset_type; @@ -1495,71 +1551,43 @@ struct CaseWhenFunctor> { } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const auto& conds_array = *batch.values[0].array(); - ArrayData* output = out->mutable_array(); - const bool have_else_arg = - static_cast(conds_array.type->num_fields()) < (batch.values.size() - 1); - BuilderType builder(out->type(), ctx->memory_pool()); - RETURN_NOT_OK(builder.Reserve(batch.length)); - int64_t reservation = 0; - for (size_t arg = 1; arg < batch.values.size(); arg++) { - auto source = batch.values[arg]; - if (source.is_scalar()) { - const auto& scalar = checked_cast(*source.scalar()); - if (!scalar.value) continue; - reservation = std::max(reservation, batch.length * scalar.value->size()); - } else { - const auto& array = *source.array(); - const auto& offsets = array.GetValues(1); - reservation = std::max(reservation, offsets[array.length] - offsets[0]); - } - } - RETURN_NOT_OK(builder.ReserveData(reservation)); - - for (int64_t row = 0; row < batch.length; row++) { - int64_t selected = have_else_arg ? batch.values.size() - 1 : -1; - for (int64_t arg = 0; static_cast(arg) < conds_array.child_data.size(); - arg++) { - const ArrayData& cond_array = *conds_array.child_data[arg]; - if ((!cond_array.buffers[0] || - BitUtil::GetBit(cond_array.buffers[0]->data(), - conds_array.offset + cond_array.offset + row)) && - BitUtil::GetBit(cond_array.buffers[1]->data(), - conds_array.offset + cond_array.offset + row)) { - selected = arg + 1; - break; - } - } - if (selected < 0) { - RETURN_NOT_OK(builder.AppendNull()); - continue; - } - const Datum& source = batch.values[selected]; - if (source.is_scalar()) { - const auto& scalar = checked_cast(*source.scalar()); - if (!scalar.is_valid) { - RETURN_NOT_OK(builder.AppendNull()); - } else { - RETURN_NOT_OK(builder.Append(scalar.value->data(), scalar.value->size())); - } - } else { - const auto& array = *source.array(); - if (!array.buffers[0] || - BitUtil::GetBit(array.buffers[0]->data(), array.offset + row)) { - const offset_type* offsets = array.GetValues(1); - RETURN_NOT_OK(builder.Append(array.buffers[2]->data() + offsets[row], - offsets[row + 1] - offsets[row])); - } else { - RETURN_NOT_OK(builder.AppendNull()); - } - } - } - - ARROW_ASSIGN_OR_RAISE(auto temp_output, builder.Finish()); - *output = *temp_output->data(); - // Builder type != logical type due to GenerateTypeAgnosticVarBinaryBase - output->type = batch[1].type(); - return Status::OK(); + return ExecVarWidthArrayCaseWhen( + ctx, batch, out, + // ReserveData + [&](ArrayBuilder* raw_builder) { + int64_t reservation = 0; + for (size_t arg = 1; arg < batch.values.size(); arg++) { + auto source = batch.values[arg]; + if (source.is_scalar()) { + const auto& scalar = + checked_cast(*source.scalar()); + if (!scalar.value) continue; + reservation = + std::max(reservation, batch.length * scalar.value->size()); + } else { + const auto& array = *source.array(); + const auto& offsets = array.GetValues(1); + reservation = + std::max(reservation, offsets[array.length] - offsets[0]); + } + } + // checked_cast works since (Large)StringBuilder <: (Large)BinaryBuilder + return checked_cast(raw_builder)->ReserveData(reservation); + }, + // AppendScalar + [](ArrayBuilder* raw_builder, const Scalar& raw_scalar) { + const auto& scalar = checked_cast(raw_scalar); + return checked_cast(raw_builder) + ->Append(scalar.value->data(), scalar.value->size()); + }, + // AppendArray + [](ArrayBuilder* raw_builder, const std::shared_ptr& array, + const int64_t row) { + const offset_type* offsets = array->GetValues(1); + return checked_cast(raw_builder) + ->Append(array->buffers[2]->data() + offsets[row], + offsets[row + 1] - offsets[row]); + }); } }; @@ -1586,65 +1614,30 @@ struct CaseWhenFunctor> { }); } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const auto& conds_array = *batch.values[0].array(); - ArrayData* output = out->mutable_array(); - const bool have_else_arg = - static_cast(conds_array.type->num_fields()) < (batch.values.size() - 1); - std::unique_ptr raw_builder; - RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), out->type(), &raw_builder)); - BuilderType* builder = checked_cast(raw_builder.get()); - RETURN_NOT_OK(builder->Reserve(batch.length)); - - for (int64_t row = 0; row < batch.length; row++) { - int64_t selected = have_else_arg ? batch.values.size() - 1 : -1; - for (int64_t arg = 0; static_cast(arg) < conds_array.child_data.size(); - arg++) { - const ArrayData& cond_array = *conds_array.child_data[arg]; - if ((!cond_array.buffers[0] || - BitUtil::GetBit(cond_array.buffers[0]->data(), - conds_array.offset + cond_array.offset + row)) && - BitUtil::GetBit(cond_array.buffers[1]->data(), - conds_array.offset + cond_array.offset + row)) { - selected = arg + 1; - break; - } - } - if (selected < 0) { - RETURN_NOT_OK(builder->AppendNull()); - continue; - } - const Datum& source = batch.values[selected]; - // This is horrendously slow, but generic - if (source.is_scalar()) { - const auto& scalar = *source.scalar(); - if (!scalar.is_valid) { - RETURN_NOT_OK(builder->AppendNull()); - } else { - RETURN_NOT_OK(builder->AppendScalar(scalar)); - } - } else { - const auto& array = *source.array(); - if (!array.buffers[0] || - BitUtil::GetBit(array.buffers[0]->data(), array.offset + row)) { - const auto boxed_array = source.make_array(); - if (boxed_array->IsValid(row)) { - ARROW_ASSIGN_OR_RAISE(auto element, boxed_array->GetScalar(row)); - RETURN_NOT_OK(builder->AppendScalar(*element)); - } else { - RETURN_NOT_OK(builder->AppendNull()); - } - } else { - RETURN_NOT_OK(builder->AppendNull()); - } - } - } - - ARROW_ASSIGN_OR_RAISE(auto temp_output, builder->Finish()); - *output = *temp_output->data(); - return Status::OK(); + // TODO: horrendously slow, but generic + return ExecVarWidthArrayCaseWhen( + ctx, batch, out, + // ReserveData + [](ArrayBuilder*) {}, + // AppendScalar + [](ArrayBuilder* raw_builder, const Scalar& scalar) { + return raw_builder->AppendScalar(scalar); + }, + // AppendArray + [](ArrayBuilder* raw_builder, const std::shared_ptr& array, + const int64_t row) { + ARROW_ASSIGN_OR_RAISE(auto scalar, MakeArray(array)->GetScalar(row)); + return raw_builder->AppendScalar(*scalar); + }); } }; +// TODO: map, fixed size list, struct, union, dictionary + +// TODO: file separate issue for dictionary? need utility to unify +// dictionary and return mapping. what is an R factor? may need to +// promote index type + struct CoalesceFunction : ScalarFunction { using ScalarFunction::ScalarFunction; diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc index 9b59d54c3da..dbe6f957247 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc @@ -27,32 +27,31 @@ namespace arrow { namespace compute { const int64_t kNumItems = 1024 * 1024; +const int64_t kFewItems = 64 * 1024; template -struct SetBytesProcessed {}; +struct GetBytesProcessed {}; + +template <> +struct GetBytesProcessed { + static int64_t Get(const std::shared_ptr& arr) { return arr->length() / 8; } +}; template -struct SetBytesProcessed> { - static void Set(const std::shared_ptr& cond, const std::shared_ptr& left, - const std::shared_ptr& right, benchmark::State* state) { +struct GetBytesProcessed> { + static int64_t Get(const std::shared_ptr& arr) { using CType = typename Type::c_type; - state->SetBytesProcessed(state->iterations() * - (cond->length() / 8 + 2 * cond->length() * sizeof(CType))); + return arr->length() * sizeof(CType); } }; template -struct SetBytesProcessed> { - static void Set(const std::shared_ptr& cond, const std::shared_ptr& left, - const std::shared_ptr& right, benchmark::State* state) { +struct GetBytesProcessed> { + static int64_t Get(const std::shared_ptr& arr) { using ArrayType = typename TypeTraits::ArrayType; using OffsetType = typename TypeTraits::OffsetType::c_type; - - state->SetBytesProcessed( - state->iterations() * - (cond->length() / 8 + 2 * cond->length() * sizeof(OffsetType) + - std::static_pointer_cast(left)->total_values_length() + - std::static_pointer_cast(right)->total_values_length())); + return arr->length() * sizeof(OffsetType) + + std::static_pointer_cast(arr)->total_values_length(); } }; @@ -80,7 +79,10 @@ static void IfElseBench(benchmark::State& state) { ABORT_NOT_OK(IfElse(cond, left, right)); } - SetBytesProcessed::Set(cond, left, right, &state); + state.SetBytesProcessed(state.iterations() * + (GetBytesProcessed::Get(cond) + + GetBytesProcessed::Get(left) + + GetBytesProcessed::Get(right))); } template @@ -109,7 +111,10 @@ static void IfElseBenchContiguous(benchmark::State& state) { ABORT_NOT_OK(IfElse(cond, left, right)); } - SetBytesProcessed::Set(cond, left, right, &state); + state.SetBytesProcessed(state.iterations() * + (GetBytesProcessed::Get(cond) + + GetBytesProcessed::Get(left) + + GetBytesProcessed::Get(right))); } static void IfElseBench64(benchmark::State& state) { @@ -146,7 +151,6 @@ static void IfElseBenchString32Contiguous(benchmark::State& state) { template static void CaseWhenBench(benchmark::State& state) { - using CType = typename Type::c_type; auto type = TypeTraits::type_singleton(); using ArrayType = typename TypeTraits::ArrayType; @@ -180,12 +184,50 @@ static void CaseWhenBench(benchmark::State& state) { val3->Slice(offset), val4->Slice(offset)})); } - state.SetBytesProcessed(state.iterations() * (len - offset) * sizeof(CType)); + // Set bytes processed to ~length of output + state.SetBytesProcessed(state.iterations() * GetBytesProcessed::Get(val1)); + state.SetItemsProcessed(state.iterations() * (len - offset)); +} + +static void CaseWhenBenchList(benchmark::State& state) { + auto type = list(int64()); + auto fld = field("", type); + + int64_t len = state.range(0); + int64_t offset = state.range(1); + + random::RandomArrayGenerator rand(/*seed=*/0); + + auto cond1 = std::static_pointer_cast( + rand.ArrayOf(boolean(), len, /*null_probability=*/0.01)); + auto cond2 = std::static_pointer_cast( + rand.ArrayOf(boolean(), len, /*null_probability=*/0.01)); + auto cond3 = std::static_pointer_cast( + rand.ArrayOf(boolean(), len, /*null_probability=*/0.01)); + auto cond_field = + field("cond", boolean(), key_value_metadata({{"null_probability", "0.01"}})); + auto cond = rand.ArrayOf(*field("", struct_({cond_field, cond_field, cond_field}), + key_value_metadata({{"null_probability", "0.0"}})), + len); + auto val1 = rand.ArrayOf(*fld, len); + auto val2 = rand.ArrayOf(*fld, len); + auto val3 = rand.ArrayOf(*fld, len); + auto val4 = rand.ArrayOf(*fld, len); + for (auto _ : state) { + ABORT_NOT_OK( + CaseWhen(cond->Slice(offset), {val1->Slice(offset), val2->Slice(offset), + val3->Slice(offset), val4->Slice(offset)})); + } + + // Set bytes processed to ~length of output + state.SetBytesProcessed(state.iterations() * + GetBytesProcessed::Get( + std::static_pointer_cast(val1)->values())); + state.SetItemsProcessed(state.iterations() * (len - offset)); } template static void CaseWhenBenchContiguous(benchmark::State& state) { - using CType = typename Type::c_type; auto type = TypeTraits::type_singleton(); using ArrayType = typename TypeTraits::ArrayType; @@ -216,7 +258,9 @@ static void CaseWhenBenchContiguous(benchmark::State& state) { val3->Slice(offset)})); } - state.SetBytesProcessed(state.iterations() * (len - offset) * sizeof(CType)); + // Set bytes processed to ~length of output + state.SetBytesProcessed(state.iterations() * GetBytesProcessed::Get(val1)); + state.SetItemsProcessed(state.iterations() * (len - offset)); } static void CaseWhenBench64(benchmark::State& state) { @@ -227,6 +271,14 @@ static void CaseWhenBench64Contiguous(benchmark::State& state) { return CaseWhenBenchContiguous(state); } +static void CaseWhenBenchString(benchmark::State& state) { + return CaseWhenBench(state); +} + +static void CaseWhenBenchStringContiguous(benchmark::State& state) { + return CaseWhenBenchContiguous(state); +} + template static void CoalesceBench(benchmark::State& state) { using CType = typename Type::c_type; @@ -337,6 +389,15 @@ BENCHMARK(CaseWhenBench64)->Args({kNumItems, 99}); BENCHMARK(CaseWhenBench64Contiguous)->Args({kNumItems, 0}); BENCHMARK(CaseWhenBench64Contiguous)->Args({kNumItems, 99}); +BENCHMARK(CaseWhenBenchList)->Args({kFewItems, 0}); +BENCHMARK(CaseWhenBenchList)->Args({kFewItems, 99}); + +BENCHMARK(CaseWhenBenchString)->Args({kFewItems, 0}); +BENCHMARK(CaseWhenBenchString)->Args({kFewItems, 99}); + +BENCHMARK(CaseWhenBenchStringContiguous)->Args({kFewItems, 0}); +BENCHMARK(CaseWhenBenchStringContiguous)->Args({kFewItems, 99}); + BENCHMARK(CoalesceBench64)->Args({kNumItems, 0}); BENCHMARK(CoalesceBench64)->Args({kNumItems, 99}); From 6ffb2727955fa4f6f264628b4ba2b172ea198bbd Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 22 Jul 2021 15:10:05 -0400 Subject: [PATCH 04/31] ARROW-13222: [C++] Add basic implementations for map, fixed size list --- .../arrow/compute/kernels/scalar_if_else.cc | 83 +++++++- .../compute/kernels/scalar_if_else_test.cc | 184 ++++++++++++++++++ 2 files changed, 265 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index 70ad1676de4..d7522988136 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1593,8 +1593,84 @@ struct CaseWhenFunctor> { template struct CaseWhenFunctor> { - using offset_type = typename Type::offset_type; - using BuilderType = typename TypeTraits::BuilderType; + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + if (batch[0].null_count() > 0) { + return Status::Invalid("cond struct must not have outer nulls"); + } + if (batch[0].is_scalar()) { + return ExecScalar(ctx, batch, out); + } + return ExecArray(ctx, batch, out); + } + + static Status ExecScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + return ExecVarWidthScalarCaseWhen( + ctx, batch, out, + [](KernelContext* ctx, const ArrayData& source, ArrayData* output) { + *output = source; + return Status::OK(); + }); + } + static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + // TODO: horrendously slow, but generic + return ExecVarWidthArrayCaseWhen( + ctx, batch, out, + // ReserveData + [](ArrayBuilder*) {}, + // AppendScalar + [](ArrayBuilder* raw_builder, const Scalar& scalar) { + return raw_builder->AppendScalar(scalar); + }, + // AppendArray + [](ArrayBuilder* raw_builder, const std::shared_ptr& array, + const int64_t row) { + ARROW_ASSIGN_OR_RAISE(auto scalar, MakeArray(array)->GetScalar(row)); + return raw_builder->AppendScalar(*scalar); + }); + } +}; + +template <> +struct CaseWhenFunctor { + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + if (batch[0].null_count() > 0) { + return Status::Invalid("cond struct must not have outer nulls"); + } + if (batch[0].is_scalar()) { + return ExecScalar(ctx, batch, out); + } + return ExecArray(ctx, batch, out); + } + + static Status ExecScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + return ExecVarWidthScalarCaseWhen( + ctx, batch, out, + [](KernelContext* ctx, const ArrayData& source, ArrayData* output) { + *output = source; + return Status::OK(); + }); + } + static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + // TODO: horrendously slow, but generic + return ExecVarWidthArrayCaseWhen( + ctx, batch, out, + // ReserveData + [](ArrayBuilder*) {}, + // AppendScalar + [](ArrayBuilder* raw_builder, const Scalar& scalar) { + return raw_builder->AppendScalar(scalar); + }, + // AppendArray + [](ArrayBuilder* raw_builder, const std::shared_ptr& array, + const int64_t row) { + ARROW_ASSIGN_OR_RAISE(auto scalar, MakeArray(array)->GetScalar(row)); + return raw_builder->AppendScalar(*scalar); + }); + } +}; + +template <> +struct CaseWhenFunctor { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { if (batch[0].null_count() > 0) { return Status::Invalid("cond struct must not have outer nulls"); @@ -2199,6 +2275,9 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { AddBinaryCaseWhenKernels(func, BaseBinaryTypes()); AddCaseWhenKernel(func, Type::LIST, CaseWhenFunctor::Exec); AddCaseWhenKernel(func, Type::LARGE_LIST, CaseWhenFunctor::Exec); + AddCaseWhenKernel(func, Type::FIXED_SIZE_LIST, + CaseWhenFunctor::Exec); + AddCaseWhenKernel(func, Type::MAP, CaseWhenFunctor::Exec); DCHECK_OK(registry->AddFunction(std::move(func))); } { diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index 39ff7a90d08..b1056ca94fe 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -977,6 +977,190 @@ TYPED_TEST(TestCaseWhenList, ListOfString) { ArrayFromJSON(type, R"([null, null, null, ["ef", "g"]])")); } +TEST(TestCaseWhen, Map) { + auto type = map(int64(), utf8()); + auto cond_true = ScalarFromJSON(boolean(), "true"); + auto cond_false = ScalarFromJSON(boolean(), "false"); + auto cond_null = ScalarFromJSON(boolean(), "null"); + auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); + auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); + auto scalar_null = ScalarFromJSON(type, "null"); + auto scalar1 = ScalarFromJSON(type, R"([[1, "abc"], [2, "de"]])"); + auto scalar2 = ScalarFromJSON(type, R"([[3, "fghi"]])"); + auto values_null = ArrayFromJSON(type, "[null, null, null, null]"); + auto values1 = + ArrayFromJSON(type, R"([[[4, "kl"]], null, [[5, "mn"]], [[6, "o"], [7, "pq"]]])"); + auto values2 = ArrayFromJSON(type, R"([[[8, "r"], [9, "st"]], [[10, "u"]], null, []])"); + + CheckScalar("case_when", {MakeStruct({}), values1}, values1); + CheckScalar("case_when", {MakeStruct({}), values_null}, values_null); + + CheckScalar("case_when", {MakeStruct({cond_true}), scalar1, values1}, + *MakeArrayFromScalar(*scalar1, 4)); + CheckScalar("case_when", {MakeStruct({cond_false}), scalar1, values1}, values1); + + CheckScalar("case_when", {MakeStruct({cond_true}), values1}, values1); + CheckScalar("case_when", {MakeStruct({cond_false}), values1}, values_null); + CheckScalar("case_when", {MakeStruct({cond_null}), values1}, values_null); + CheckScalar("case_when", {MakeStruct({cond_true}), values1, values2}, values1); + CheckScalar("case_when", {MakeStruct({cond_false}), values1, values2}, values2); + CheckScalar("case_when", {MakeStruct({cond_null}), values1, values2}, values2); + + CheckScalar("case_when", {MakeStruct({cond_true, cond_true}), values1, values2}, + values1); + CheckScalar("case_when", {MakeStruct({cond_false, cond_false}), values1, values2}, + values_null); + CheckScalar("case_when", {MakeStruct({cond_true, cond_false}), values1, values2}, + values1); + CheckScalar("case_when", {MakeStruct({cond_false, cond_true}), values1, values2}, + values2); + CheckScalar("case_when", {MakeStruct({cond_null, cond_true}), values1, values2}, + values2); + CheckScalar("case_when", + {MakeStruct({cond_false, cond_false}), values1, values2, values2}, values2); + + CheckScalar( + "case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2}, + ArrayFromJSON( + type, + R"([[[1, "abc"], [2, "de"]], [[1, "abc"], [2, "de"]], [[3, "fghi"]], null])")); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null}, values_null); + CheckScalar( + "case_when", {MakeStruct({cond1}), scalar_null, scalar1}, + ArrayFromJSON(type, + R"([null, null, [[1, "abc"], [2, "de"]], [[1, "abc"], [2, "de"]]])")); + CheckScalar( + "case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2, scalar1}, + ArrayFromJSON( + type, + R"([[[1, "abc"], [2, "de"]], [[1, "abc"], [2, "de"]], [[3, "fghi"]], [[1, "abc"], [2, "de"]]])")); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + ArrayFromJSON(type, R"([[[4, "kl"]], null, null, null])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, + ArrayFromJSON(type, R"([[[4, "kl"]], null, null, [[6, "o"], [7, "pq"]]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + ArrayFromJSON(type, R"([null, null, null, [[6, "o"], [7, "pq"]]])")); +} + +TEST(TestCaseWhen, FixedSizeListOfInt) { + auto type = fixed_size_list(int64(), 2); + auto cond_true = ScalarFromJSON(boolean(), "true"); + auto cond_false = ScalarFromJSON(boolean(), "false"); + auto cond_null = ScalarFromJSON(boolean(), "null"); + auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); + auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); + auto scalar_null = ScalarFromJSON(type, "null"); + auto scalar1 = ScalarFromJSON(type, R"([1, 2])"); + auto scalar2 = ScalarFromJSON(type, R"([3, null])"); + auto values_null = ArrayFromJSON(type, "[null, null, null, null]"); + auto values1 = ArrayFromJSON(type, R"([[4, 5], null, [6, 7], [8, 9]])"); + auto values2 = ArrayFromJSON(type, R"([[10, 11], [12, null], null, [null, 13]])"); + + CheckScalar("case_when", {MakeStruct({}), values1}, values1); + CheckScalar("case_when", {MakeStruct({}), values_null}, values_null); + + CheckScalar("case_when", {MakeStruct({cond_true}), scalar1, values1}, + *MakeArrayFromScalar(*scalar1, 4)); + CheckScalar("case_when", {MakeStruct({cond_false}), scalar1, values1}, values1); + + CheckScalar("case_when", {MakeStruct({cond_true}), values1}, values1); + CheckScalar("case_when", {MakeStruct({cond_false}), values1}, values_null); + CheckScalar("case_when", {MakeStruct({cond_null}), values1}, values_null); + CheckScalar("case_when", {MakeStruct({cond_true}), values1, values2}, values1); + CheckScalar("case_when", {MakeStruct({cond_false}), values1, values2}, values2); + CheckScalar("case_when", {MakeStruct({cond_null}), values1, values2}, values2); + + CheckScalar("case_when", {MakeStruct({cond_true, cond_true}), values1, values2}, + values1); + CheckScalar("case_when", {MakeStruct({cond_false, cond_false}), values1, values2}, + values_null); + CheckScalar("case_when", {MakeStruct({cond_true, cond_false}), values1, values2}, + values1); + CheckScalar("case_when", {MakeStruct({cond_false, cond_true}), values1, values2}, + values2); + CheckScalar("case_when", {MakeStruct({cond_null, cond_true}), values1, values2}, + values2); + CheckScalar("case_when", + {MakeStruct({cond_false, cond_false}), values1, values2, values2}, values2); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2}, + ArrayFromJSON(type, R"([[1, 2], [1, 2], [3, null], null])")); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null}, values_null); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null, scalar1}, + ArrayFromJSON(type, R"([null, null, [1, 2], [1, 2]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2, scalar1}, + ArrayFromJSON(type, R"([[1, 2], [1, 2], [3, null], [1, 2]])")); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + ArrayFromJSON(type, R"([[4, 5], null, null, null])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, + ArrayFromJSON(type, R"([[4, 5], null, null, [8, 9]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + ArrayFromJSON(type, R"([null, null, null, [8, 9]])")); +} + +TEST(TestCaseWhen, FixedSizeListOfString) { + auto type = fixed_size_list(utf8(), 2); + auto cond_true = ScalarFromJSON(boolean(), "true"); + auto cond_false = ScalarFromJSON(boolean(), "false"); + auto cond_null = ScalarFromJSON(boolean(), "null"); + auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); + auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); + auto scalar_null = ScalarFromJSON(type, "null"); + auto scalar1 = ScalarFromJSON(type, R"(["aB", "xYz"])"); + auto scalar2 = ScalarFromJSON(type, R"(["b", null])"); + auto values_null = ArrayFromJSON(type, "[null, null, null, null]"); + auto values1 = + ArrayFromJSON(type, R"([["cD", "E"], null, ["de", "gfhi"], ["ef", "g"]])"); + auto values2 = + ArrayFromJSON(type, R"([["fghi", "jk"], ["ghi", null], null, [null, "hi"]])"); + + CheckScalar("case_when", {MakeStruct({}), values1}, values1); + CheckScalar("case_when", {MakeStruct({}), values_null}, values_null); + + CheckScalar("case_when", {MakeStruct({cond_true}), scalar1, values1}, + *MakeArrayFromScalar(*scalar1, 4)); + CheckScalar("case_when", {MakeStruct({cond_false}), scalar1, values1}, values1); + + CheckScalar("case_when", {MakeStruct({cond_true}), values1}, values1); + CheckScalar("case_when", {MakeStruct({cond_false}), values1}, values_null); + CheckScalar("case_when", {MakeStruct({cond_null}), values1}, values_null); + CheckScalar("case_when", {MakeStruct({cond_true}), values1, values2}, values1); + CheckScalar("case_when", {MakeStruct({cond_false}), values1, values2}, values2); + CheckScalar("case_when", {MakeStruct({cond_null}), values1, values2}, values2); + + CheckScalar("case_when", {MakeStruct({cond_true, cond_true}), values1, values2}, + values1); + CheckScalar("case_when", {MakeStruct({cond_false, cond_false}), values1, values2}, + values_null); + CheckScalar("case_when", {MakeStruct({cond_true, cond_false}), values1, values2}, + values1); + CheckScalar("case_when", {MakeStruct({cond_false, cond_true}), values1, values2}, + values2); + CheckScalar("case_when", {MakeStruct({cond_null, cond_true}), values1, values2}, + values2); + CheckScalar("case_when", + {MakeStruct({cond_false, cond_false}), values1, values2, values2}, values2); + + CheckScalar( + "case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2}, + ArrayFromJSON(type, R"([["aB", "xYz"], ["aB", "xYz"], ["b", null], null])")); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null}, values_null); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null, scalar1}, + ArrayFromJSON(type, R"([null, null, ["aB", "xYz"], ["aB", "xYz"]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2, scalar1}, + ArrayFromJSON( + type, R"([["aB", "xYz"], ["aB", "xYz"], ["b", null], ["aB", "xYz"]])")); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + ArrayFromJSON(type, R"([["cD", "E"], null, null, null])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, + ArrayFromJSON(type, R"([["cD", "E"], null, null, ["ef", "g"]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + ArrayFromJSON(type, R"([null, null, null, ["ef", "g"]])")); +} + TEST(TestCaseWhen, DispatchBest) { CheckDispatchBest("case_when", {struct_({field("", boolean())}), int64(), int32()}, {struct_({field("", boolean())}), int64(), int64()}); From 0d42d408a6af540ea7370269ba2237503997c445 Mon Sep 17 00:00:00 2001 From: David Li Date: Fri, 23 Jul 2021 15:22:10 -0400 Subject: [PATCH 05/31] ARROW-13222: [C++] Much faster, but less generic case_when for nested types --- cpp/src/arrow/array/builder_base.h | 11 ++ cpp/src/arrow/array/builder_primitive.h | 15 ++ cpp/src/arrow/buffer_builder.h | 11 ++ .../arrow/compute/kernels/scalar_if_else.cc | 166 ++++++++++++++++-- .../compute/kernels/scalar_if_else_test.cc | 35 ++++ 5 files changed, 220 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index c2aba4e959f..b52a6b668ec 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -189,6 +189,17 @@ class ARROW_EXPORT ArrayBuilder { null_count_ = null_bitmap_builder_.false_count(); } + // Vector append. Copy from a given bitmap. If bitmap is null assume + // all of length bits are valid. + void UnsafeAppendToBitmap(const uint8_t* bitmap, int64_t offset, int64_t length) { + if (bitmap == NULLPTR) { + return UnsafeSetNotNull(length); + } + null_bitmap_builder_.UnsafeAppend(bitmap, offset, length); + length_ += length; + null_count_ = null_bitmap_builder_.false_count(); + } + // Append the same validity value a given number of times. void UnsafeAppendToBitmap(const int64_t num_bits, bool value) { if (value) { diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index e0f39f97967..2643006b072 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -153,6 +153,21 @@ class NumericBuilder : public ArrayBuilder { return Status::OK(); } + /// \brief Append a sequence of elements in one shot + /// \param[in] values a contiguous C array of values + /// \param[in] length the number of values to append + /// \param[in] bitmap a validity bitmap to copy (may be null) + /// \param[in] bitmap_offset an offset into the validity bitmap + /// \return Status + Status AppendValues(const value_type* values, int64_t length, const uint8_t* bitmap, + int64_t bitmap_offset) { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values, length); + // length_ is update by these + ArrayBuilder::UnsafeAppendToBitmap(bitmap, bitmap_offset, length); + return Status::OK(); + } + /// \brief Append a sequence of elements in one shot /// \param[in] values a contiguous C array of values /// \param[in] length the number of values to append diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index eb3f68affc0..2c5afde1fdc 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -350,6 +350,17 @@ class TypedBufferBuilder { bit_length_ += num_elements; } + void UnsafeAppend(const uint8_t* bitmap, int64_t offset, int64_t num_elements) { + if (num_elements == 0) return; + int64_t i = offset; + internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] { + bool value = BitUtil::GetBit(bitmap, i++); + false_count_ += !value; + return value; + }); + bit_length_ += num_elements; + } + void UnsafeAppend(const int64_t num_copies, bool value) { BitUtil::SetBitsTo(mutable_data(), bit_length_, num_copies, value); false_count_ += num_copies * !value; diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index d7522988136..78187cc0cde 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -16,6 +16,7 @@ // under the License. #include +#include #include #include #include @@ -1591,8 +1592,101 @@ struct CaseWhenFunctor> { } }; +using ArrayAppenderFunc = std::function&, int64_t, int64_t)>; + +static Status GetValueAppenders(const DataType& type, ArrayAppenderFunc* array_appender); + +struct GetAppenders { + template + enable_if_number Visit(const T&) { + using BuilderType = typename TypeTraits::BuilderType; + using c_type = typename T::c_type; + array_appender = [](ArrayBuilder* raw_builder, + const std::shared_ptr& array, const int64_t offset, + const int64_t length) { + return checked_cast(raw_builder) + ->AppendValues(array->GetValues(1) + offset, length, + array->GetValues(0, 0), array->offset + offset); + }; + return Status::OK(); + } + + Status Visit(const StringType&) { + array_appender = [](ArrayBuilder* raw_builder, + const std::shared_ptr& array, const int64_t offset, + const int64_t length) { + auto builder = checked_cast(raw_builder); + auto bitmap = array->GetValues(0, 0); + auto offsets = array->GetValues(1); + auto data = array->GetValues(2, 0); + for (int64_t i = 0; i < length; i++) { + if (!bitmap || BitUtil::GetBit(bitmap, offset + i)) { + const int32_t start = offsets[offset + i]; + const int32_t end = offsets[offset + i + 1]; + RETURN_NOT_OK(builder->Append(data + start, end - start)); + } else { + RETURN_NOT_OK(builder->AppendNull()); + } + } + return Status::OK(); + }; + return Status::OK(); + } + + template + enable_if_var_size_list Visit(const T& ty) { + // TODO: reuse this below? Or make a fully generic (but runtime + // dispatched) impl of the top level case when + using BuilderType = typename TypeTraits::BuilderType; + using offset_type = typename T::offset_type; + const auto& list_ty = checked_cast(ty); + ArrayAppenderFunc sub_appender; + RETURN_NOT_OK(GetValueAppenders(*list_ty.value_type(), &sub_appender)); + array_appender = [=](ArrayBuilder* raw_builder, + const std::shared_ptr& array, const int64_t offset, + const int64_t length) { + auto builder = checked_cast(raw_builder); + auto child_builder = builder->value_builder(); + const offset_type* offsets = array->GetValues(1); + const uint8_t* validity = + array->MayHaveNulls() ? array->buffers[0]->data() : nullptr; + for (int64_t row = offset; row < offset + length; row++) { + if (!validity || BitUtil::GetBit(validity, array->offset + row)) { + RETURN_NOT_OK(builder->Append()); + int64_t length = offsets[row + 1] - offsets[row]; + RETURN_NOT_OK( + sub_appender(child_builder, array->child_data[0], offsets[row], length)); + } else { + RETURN_NOT_OK(builder->AppendNull()); + } + } + return Status::OK(); + }; + return Status::OK(); + } + + Status Visit(const DataType& ty) { + return Status::NotImplemented("Appender for type ", ty); + } + + ArrayAppenderFunc GetArrayAppender() { return array_appender; } + + ArrayAppenderFunc array_appender; +}; + +static Status GetValueAppenders(const DataType& type, ArrayAppenderFunc* array_appender) { + // TODO: should cover scalars too + GetAppenders get_appenders; + RETURN_NOT_OK(VisitTypeInline(type, &get_appenders)); + *array_appender = std::move(get_appenders.array_appender); + return Status::OK(); +} + template struct CaseWhenFunctor> { + using offset_type = typename Type::offset_type; + using BuilderType = typename TypeTraits::BuilderType; static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { if (batch[0].null_count() > 0) { return Status::Invalid("cond struct must not have outer nulls"); @@ -1612,20 +1706,45 @@ struct CaseWhenFunctor> { }); } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - // TODO: horrendously slow, but generic + const auto& ty = checked_cast(*out->type()); + ArrayAppenderFunc array_appender; + RETURN_NOT_OK(GetValueAppenders(*ty.value_type(), &array_appender)); return ExecVarWidthArrayCaseWhen( ctx, batch, out, // ReserveData - [](ArrayBuilder*) {}, + [&](ArrayBuilder* raw_builder) { + auto builder = checked_cast(raw_builder); + auto child_builder = builder->value_builder(); + + int64_t reservation = 0; + for (size_t arg = 1; arg < batch.values.size(); arg++) { + auto source = batch.values[arg]; + if (!source.is_array()) { + const auto& scalar = checked_cast(*source.scalar()); + if (!scalar.value) continue; + reservation = + std::max(reservation, batch.length * scalar.value->length()); + } else { + const auto& array = *source.array(); + reservation = std::max(reservation, array.child_data[0]->length); + } + } + return child_builder->Reserve(reservation); + }, // AppendScalar [](ArrayBuilder* raw_builder, const Scalar& scalar) { return raw_builder->AppendScalar(scalar); }, // AppendArray - [](ArrayBuilder* raw_builder, const std::shared_ptr& array, - const int64_t row) { - ARROW_ASSIGN_OR_RAISE(auto scalar, MakeArray(array)->GetScalar(row)); - return raw_builder->AppendScalar(*scalar); + [&](ArrayBuilder* raw_builder, const std::shared_ptr& array, + const int64_t row) { + auto builder = checked_cast(raw_builder); + auto child_builder = builder->value_builder(); + RETURN_NOT_OK(builder->Append()); + const offset_type* offsets = array->GetValues(1); + int64_t length = offsets[row + 1] - offsets[row]; + return array_appender(child_builder, array->child_data[0], offsets[row], + length); }); } }; @@ -1690,30 +1809,41 @@ struct CaseWhenFunctor { }); } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - // TODO: horrendously slow, but generic + const auto& ty = checked_cast(*out->type()); + const int64_t width = ty.list_size(); + ArrayAppenderFunc array_appender; + RETURN_NOT_OK(GetValueAppenders(*ty.value_type(), &array_appender)); return ExecVarWidthArrayCaseWhen( ctx, batch, out, // ReserveData - [](ArrayBuilder*) {}, + [&](ArrayBuilder* raw_builder) { + int64_t children = batch.length * width; + return checked_cast(raw_builder) + ->value_builder() + ->Reserve(children); + }, // AppendScalar [](ArrayBuilder* raw_builder, const Scalar& scalar) { + // // Append the boxed array to the child builder, then append a new offset + // auto child_builder = + // checked_cast(raw_builder)->value_builder(); return + // raw_builder->Append(); return raw_builder->AppendScalar(scalar); }, // AppendArray - [](ArrayBuilder* raw_builder, const std::shared_ptr& array, - const int64_t row) { - ARROW_ASSIGN_OR_RAISE(auto scalar, MakeArray(array)->GetScalar(row)); - return raw_builder->AppendScalar(*scalar); + [&](ArrayBuilder* raw_builder, const std::shared_ptr& array, + const int64_t row) { + // Append a slice of the child array to the child builder, then append a new + // offset + auto builder = checked_cast(raw_builder); + auto child_builder = builder->value_builder(); + array_appender(child_builder, array->child_data[0], + width * (array->offset + row), width); + return builder->Append(); }); } }; -// TODO: map, fixed size list, struct, union, dictionary - -// TODO: file separate issue for dictionary? need utility to unify -// dictionary and return mapping. what is an R factor? may need to -// promote index type - struct CoalesceFunction : ScalarFunction { using ScalarFunction::ScalarFunction; diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index b1056ca94fe..02dc5f1cc94 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -977,6 +977,41 @@ TYPED_TEST(TestCaseWhenList, ListOfString) { ArrayFromJSON(type, R"([null, null, null, ["ef", "g"]])")); } +TYPED_TEST(TestCaseWhenList, ListOfInt) { + // More minimal test to check type coverage + auto type = std::make_shared(int64()); + auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); + auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); + auto values_null = ArrayFromJSON(type, "[null, null, null, null]"); + auto values1 = ArrayFromJSON(type, R"([[1, 2], null, [3, 4, 5], [6, null]])"); + auto values2 = ArrayFromJSON(type, R"([[8, 9, 10], [11], null, [12]])"); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + ArrayFromJSON(type, R"([[1, 2], null, null, null])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, + ArrayFromJSON(type, R"([[1, 2], null, null, [6, null]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + ArrayFromJSON(type, R"([null, null, null, [6, null]])")); +} + +TYPED_TEST(TestCaseWhenList, ListOfListOfInt) { + // More minimal test to check type coverage + auto type = std::make_shared(list(int64())); + auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); + auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); + auto values_null = ArrayFromJSON(type, "[null, null, null, null]"); + auto values1 = + ArrayFromJSON(type, R"([[[1, 2], []], null, [[3, 4, 5]], [[6, null], null]])"); + auto values2 = ArrayFromJSON(type, R"([[[8, 9, 10]], [[11]], null, [[12]]])"); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + ArrayFromJSON(type, R"([[[1, 2], []], null, null, null])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, + ArrayFromJSON(type, R"([[[1, 2], []], null, null, [[6, null], null]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + ArrayFromJSON(type, R"([null, null, null, [[6, null], null]])")); +} + TEST(TestCaseWhen, Map) { auto type = map(int64(), utf8()); auto cond_true = ScalarFromJSON(boolean(), "true"); From 3ec727ab83d036c7035e6cb5ea8c7a75e51a4b88 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 26 Jul 2021 12:38:22 -0400 Subject: [PATCH 06/31] ARROW-13222: [C++] Expand type support --- .../arrow/compute/kernels/scalar_if_else.cc | 173 ++++++++++++++---- .../compute/kernels/scalar_if_else_test.cc | 121 ++++++++++++ 2 files changed, 262 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index 78187cc0cde..e1896e591c9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1592,6 +1592,7 @@ struct CaseWhenFunctor> { } }; +// Given an array and a builder, append a slice of the array to the builder using ArrayAppenderFunc = std::function&, int64_t, int64_t)>; @@ -1612,13 +1613,16 @@ struct GetAppenders { return Status::OK(); } - Status Visit(const StringType&) { + template + enable_if_base_binary Visit(const T&) { + using BuilderType = typename TypeTraits::BuilderType; + using offset_type = typename T::offset_type; array_appender = [](ArrayBuilder* raw_builder, const std::shared_ptr& array, const int64_t offset, const int64_t length) { - auto builder = checked_cast(raw_builder); + auto builder = checked_cast(raw_builder); auto bitmap = array->GetValues(0, 0); - auto offsets = array->GetValues(1); + auto offsets = array->GetValues(1); auto data = array->GetValues(2, 0); for (int64_t i = 0; i < length; i++) { if (!bitmap || BitUtil::GetBit(bitmap, offset + i)) { @@ -1636,8 +1640,6 @@ struct GetAppenders { template enable_if_var_size_list Visit(const T& ty) { - // TODO: reuse this below? Or make a fully generic (but runtime - // dispatched) impl of the top level case when using BuilderType = typename TypeTraits::BuilderType; using offset_type = typename T::offset_type; const auto& list_ty = checked_cast(ty); @@ -1666,6 +1668,88 @@ struct GetAppenders { return Status::OK(); } + Status Visit(const MapType& ty) { + const auto& map_ty = checked_cast(ty); + ArrayAppenderFunc key_appender, item_appender; + RETURN_NOT_OK(GetValueAppenders(*map_ty.key_type(), &key_appender)); + RETURN_NOT_OK(GetValueAppenders(*map_ty.item_type(), &item_appender)); + array_appender = [=](ArrayBuilder* raw_builder, + const std::shared_ptr& array, const int64_t offset, + const int64_t length) { + auto builder = checked_cast(raw_builder); + auto key_builder = builder->key_builder(); + auto item_builder = builder->item_builder(); + const int32_t* offsets = array->GetValues(1); + const uint8_t* validity = + array->MayHaveNulls() ? array->buffers[0]->data() : nullptr; + for (int64_t row = offset; row < offset + length; row++) { + if (!validity || BitUtil::GetBit(validity, array->offset + row)) { + RETURN_NOT_OK(builder->Append()); + int64_t length = offsets[row + 1] - offsets[row]; + RETURN_NOT_OK(key_appender(key_builder, array->child_data[0]->child_data[0], + offsets[row], length)); + RETURN_NOT_OK(item_appender(item_builder, array->child_data[0]->child_data[1], + offsets[row], length)); + } else { + RETURN_NOT_OK(builder->AppendNull()); + } + } + return Status::OK(); + }; + return Status::OK(); + } + + Status Visit(const StructType& ty) { + const auto& struct_ty = checked_cast(ty); + std::vector appenders(struct_ty.num_fields()); + for (int i = 0; static_cast(i) < appenders.size(); i++) { + RETURN_NOT_OK(GetValueAppenders(*struct_ty.field(i)->type(), &appenders[i])); + } + array_appender = [=](ArrayBuilder* raw_builder, + const std::shared_ptr& array, const int64_t offset, + const int64_t length) { + auto builder = checked_cast(raw_builder); + for (int i = 0; static_cast(i) < appenders.size(); i++) { + RETURN_NOT_OK(appenders[i](builder->field_builder(i), array->child_data[i], + array->offset + offset, length)); + } + const uint8_t* validity = + array->MayHaveNulls() ? array->buffers[0]->data() : nullptr; + for (int64_t row = offset; row < offset + length; row++) { + RETURN_NOT_OK( + builder->Append(!validity || BitUtil::GetBit(validity, array->offset + row))); + } + return Status::OK(); + }; + return Status::OK(); + } + + Status Visit(const FixedSizeListType& ty) { + const auto& list_ty = checked_cast(ty); + const int64_t width = list_ty.list_size(); + ArrayAppenderFunc sub_appender; + RETURN_NOT_OK(GetValueAppenders(*list_ty.value_type(), &sub_appender)); + array_appender = [=](ArrayBuilder* raw_builder, + const std::shared_ptr& array, const int64_t offset, + const int64_t length) { + auto builder = checked_cast(raw_builder); + auto child_builder = builder->value_builder(); + const uint8_t* validity = + array->MayHaveNulls() ? array->buffers[0]->data() : nullptr; + for (int64_t row = offset; row < offset + length; row++) { + if (!validity || BitUtil::GetBit(validity, array->offset + row)) { + RETURN_NOT_OK(sub_appender(child_builder, array->child_data[0], + width * (array->offset + row), width)); + RETURN_NOT_OK(builder->Append()); + } else { + RETURN_NOT_OK(builder->AppendNull()); + } + } + return Status::OK(); + }; + return Status::OK(); + } + Status Visit(const DataType& ty) { return Status::NotImplemented("Appender for type ", ty); } @@ -1708,7 +1792,7 @@ struct CaseWhenFunctor> { static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& ty = checked_cast(*out->type()); ArrayAppenderFunc array_appender; - RETURN_NOT_OK(GetValueAppenders(*ty.value_type(), &array_appender)); + RETURN_NOT_OK(GetValueAppenders(ty, &array_appender)); return ExecVarWidthArrayCaseWhen( ctx, batch, out, // ReserveData @@ -1738,13 +1822,7 @@ struct CaseWhenFunctor> { // AppendArray [&](ArrayBuilder* raw_builder, const std::shared_ptr& array, const int64_t row) { - auto builder = checked_cast(raw_builder); - auto child_builder = builder->value_builder(); - RETURN_NOT_OK(builder->Append()); - const offset_type* offsets = array->GetValues(1); - int64_t length = offsets[row + 1] - offsets[row]; - return array_appender(child_builder, array->child_data[0], offsets[row], - length); + return array_appender(raw_builder, array, row, /*length=*/1); }); } }; @@ -1770,7 +1848,8 @@ struct CaseWhenFunctor { }); } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - // TODO: horrendously slow, but generic + ArrayAppenderFunc array_appender; + RETURN_NOT_OK(GetValueAppenders(*out->type(), &array_appender)); return ExecVarWidthArrayCaseWhen( ctx, batch, out, // ReserveData @@ -1780,10 +1859,48 @@ struct CaseWhenFunctor { return raw_builder->AppendScalar(scalar); }, // AppendArray - [](ArrayBuilder* raw_builder, const std::shared_ptr& array, - const int64_t row) { - ARROW_ASSIGN_OR_RAISE(auto scalar, MakeArray(array)->GetScalar(row)); - return raw_builder->AppendScalar(*scalar); + [&](ArrayBuilder* raw_builder, const std::shared_ptr& array, + const int64_t row) { + return array_appender(raw_builder, array, row, /*length=*/1); + }); + } +}; + +template <> +struct CaseWhenFunctor { + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + if (batch[0].null_count() > 0) { + return Status::Invalid("cond struct must not have outer nulls"); + } + if (batch[0].is_scalar()) { + return ExecScalar(ctx, batch, out); + } + return ExecArray(ctx, batch, out); + } + + static Status ExecScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + return ExecVarWidthScalarCaseWhen( + ctx, batch, out, + [](KernelContext* ctx, const ArrayData& source, ArrayData* output) { + *output = source; + return Status::OK(); + }); + } + static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + ArrayAppenderFunc array_appender; + RETURN_NOT_OK(GetValueAppenders(*out->type(), &array_appender)); + return ExecVarWidthArrayCaseWhen( + ctx, batch, out, + // ReserveData + [](ArrayBuilder*) {}, + // AppendScalar + [](ArrayBuilder* raw_builder, const Scalar& scalar) { + return raw_builder->AppendScalar(scalar); + }, + // AppendArray + [&](ArrayBuilder* raw_builder, const std::shared_ptr& array, + const int64_t row) { + return array_appender(raw_builder, array, row, /*length=*/1); }); } }; @@ -1812,7 +1929,7 @@ struct CaseWhenFunctor { const auto& ty = checked_cast(*out->type()); const int64_t width = ty.list_size(); ArrayAppenderFunc array_appender; - RETURN_NOT_OK(GetValueAppenders(*ty.value_type(), &array_appender)); + RETURN_NOT_OK(GetValueAppenders(ty, &array_appender)); return ExecVarWidthArrayCaseWhen( ctx, batch, out, // ReserveData @@ -1824,22 +1941,12 @@ struct CaseWhenFunctor { }, // AppendScalar [](ArrayBuilder* raw_builder, const Scalar& scalar) { - // // Append the boxed array to the child builder, then append a new offset - // auto child_builder = - // checked_cast(raw_builder)->value_builder(); return - // raw_builder->Append(); return raw_builder->AppendScalar(scalar); }, // AppendArray [&](ArrayBuilder* raw_builder, const std::shared_ptr& array, const int64_t row) { - // Append a slice of the child array to the child builder, then append a new - // offset - auto builder = checked_cast(raw_builder); - auto child_builder = builder->value_builder(); - array_appender(child_builder, array->child_data[0], - width * (array->offset + row), width); - return builder->Append(); + return array_appender(raw_builder, array, row, /*length=*/1); }); } }; @@ -2403,11 +2510,13 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { AddCaseWhenKernel(func, Type::DECIMAL128, CaseWhenFunctor::Exec); AddCaseWhenKernel(func, Type::DECIMAL256, CaseWhenFunctor::Exec); AddBinaryCaseWhenKernels(func, BaseBinaryTypes()); - AddCaseWhenKernel(func, Type::LIST, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::LARGE_LIST, CaseWhenFunctor::Exec); AddCaseWhenKernel(func, Type::FIXED_SIZE_LIST, CaseWhenFunctor::Exec); + AddCaseWhenKernel(func, Type::LIST, CaseWhenFunctor::Exec); + AddCaseWhenKernel(func, Type::LARGE_LIST, CaseWhenFunctor::Exec); AddCaseWhenKernel(func, Type::MAP, CaseWhenFunctor::Exec); + AddCaseWhenKernel(func, Type::STRUCT, CaseWhenFunctor::Exec); + // TODO: union, dictionary, extension DCHECK_OK(registry->AddFunction(std::move(func))); } { diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index 02dc5f1cc94..573c03f2d1b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -1196,6 +1196,127 @@ TEST(TestCaseWhen, FixedSizeListOfString) { ArrayFromJSON(type, R"([null, null, null, ["ef", "g"]])")); } +TEST(TestCaseWhen, StructOfInt) { + auto type = struct_({field("a", uint32()), field("b", int64())}); + auto cond_true = ScalarFromJSON(boolean(), "true"); + auto cond_false = ScalarFromJSON(boolean(), "false"); + auto cond_null = ScalarFromJSON(boolean(), "null"); + auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); + auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); + auto scalar_null = ScalarFromJSON(type, "null"); + auto scalar1 = ScalarFromJSON(type, R"([1, -2])"); + auto scalar2 = ScalarFromJSON(type, R"([null, 3])"); + auto values_null = ArrayFromJSON(type, "[null, null, null, null]"); + auto values1 = ArrayFromJSON(type, R"([[4, null], null, [5, -6], [7, -8]])"); + auto values2 = ArrayFromJSON(type, R"([[9, 10], [11, -12], null, [null, null]])"); + + CheckScalar("case_when", {MakeStruct({}), values1}, values1); + CheckScalar("case_when", {MakeStruct({}), values_null}, values_null); + + CheckScalar("case_when", {MakeStruct({cond_true}), scalar1, values1}, + *MakeArrayFromScalar(*scalar1, 4)); + CheckScalar("case_when", {MakeStruct({cond_false}), scalar1, values1}, values1); + + CheckScalar("case_when", {MakeStruct({cond_true}), values1}, values1); + CheckScalar("case_when", {MakeStruct({cond_false}), values1}, values_null); + CheckScalar("case_when", {MakeStruct({cond_null}), values1}, values_null); + CheckScalar("case_when", {MakeStruct({cond_true}), values1, values2}, values1); + CheckScalar("case_when", {MakeStruct({cond_false}), values1, values2}, values2); + CheckScalar("case_when", {MakeStruct({cond_null}), values1, values2}, values2); + + CheckScalar("case_when", {MakeStruct({cond_true, cond_true}), values1, values2}, + values1); + CheckScalar("case_when", {MakeStruct({cond_false, cond_false}), values1, values2}, + values_null); + CheckScalar("case_when", {MakeStruct({cond_true, cond_false}), values1, values2}, + values1); + CheckScalar("case_when", {MakeStruct({cond_false, cond_true}), values1, values2}, + values2); + CheckScalar("case_when", {MakeStruct({cond_null, cond_true}), values1, values2}, + values2); + CheckScalar("case_when", + {MakeStruct({cond_false, cond_false}), values1, values2, values2}, values2); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2}, + ArrayFromJSON(type, R"([[1, -2], [1, -2], [null, 3], null])")); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null}, values_null); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null, scalar1}, + ArrayFromJSON(type, R"([null, null, [1, -2], [1, -2]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2, scalar1}, + ArrayFromJSON(type, R"([[1, -2], [1, -2], [null, 3], [1, -2]])")); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + ArrayFromJSON(type, R"([[4, null], null, null, null])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, + ArrayFromJSON(type, R"([[4, null], null, null, [7, -8]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + ArrayFromJSON(type, R"([null, null, null, [7, -8]])")); +} + +TEST(TestCaseWhen, StructOfString) { + // More minimal test to check type coverage + auto type = struct_({field("a", utf8()), field("b", large_utf8())}); + auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); + auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); + auto scalar_null = ScalarFromJSON(type, "null"); + auto scalar1 = ScalarFromJSON(type, R"(["a", "bc"])"); + auto scalar2 = ScalarFromJSON(type, R"([null, "d"])"); + auto values_null = ArrayFromJSON(type, "[null, null, null, null]"); + auto values1 = + ArrayFromJSON(type, R"([["efg", null], null, [null, null], [null, "hi"]])"); + auto values2 = + ArrayFromJSON(type, R"([["j", "k"], [null, "lmnop"], null, ["qr", "stu"]])"); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2}, + ArrayFromJSON(type, R"([["a", "bc"], ["a", "bc"], [null, "d"], null])")); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null}, values_null); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null, scalar1}, + ArrayFromJSON(type, R"([null, null, ["a", "bc"], ["a", "bc"]])")); + CheckScalar( + "case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2, scalar1}, + ArrayFromJSON(type, R"([["a", "bc"], ["a", "bc"], [null, "d"], ["a", "bc"]])")); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + ArrayFromJSON(type, R"([["efg", null], null, null, null])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, + ArrayFromJSON(type, R"([["efg", null], null, null, [null, "hi"]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + ArrayFromJSON(type, R"([null, null, null, [null, "hi"]])")); +} + +TEST(TestCaseWhen, StructOfListOfInt) { + // More minimal test to check type coverage + auto type = struct_({field("a", utf8()), field("b", list(int64()))}); + auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); + auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); + auto scalar_null = ScalarFromJSON(type, "null"); + auto scalar1 = ScalarFromJSON(type, R"([null, [1, null]])"); + auto scalar2 = ScalarFromJSON(type, R"(["b", null])"); + auto values_null = ArrayFromJSON(type, "[null, null, null, null]"); + auto values1 = + ArrayFromJSON(type, R"([["efg", null], null, [null, null], [null, [null, 1]]])"); + auto values2 = + ArrayFromJSON(type, R"([["j", [2, 3]], [null, [4, 5, 6]], null, ["qr", [7]]])"); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2}, + ArrayFromJSON( + type, R"([[null, [1, null]], [null, [1, null]], ["b", null], null])")); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null}, values_null); + CheckScalar( + "case_when", {MakeStruct({cond1}), scalar_null, scalar1}, + ArrayFromJSON(type, R"([null, null, [null, [1, null]], [null, [1, null]]])")); + CheckScalar( + "case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2, scalar1}, + ArrayFromJSON( + type, + R"([[null, [1, null]], [null, [1, null]], ["b", null], [null, [1, null]]])")); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + ArrayFromJSON(type, R"([["efg", null], null, null, null])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, + ArrayFromJSON(type, R"([["efg", null], null, null, [null, [null, 1]]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + ArrayFromJSON(type, R"([null, null, null, [null, [null, 1]]])")); +} + TEST(TestCaseWhen, DispatchBest) { CheckDispatchBest("case_when", {struct_({field("", boolean())}), int64(), int32()}, {struct_({field("", boolean())}), int64(), int64()}); From d6b491c869c7d40a663940a54598dc82c210838c Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 26 Jul 2021 16:14:04 -0400 Subject: [PATCH 07/31] ARROW-13222: [C++] Expand type support further --- cpp/src/arrow/array/builder_binary.cc | 8 ++ cpp/src/arrow/array/builder_binary.h | 3 + cpp/src/arrow/array/builder_primitive.cc | 8 ++ cpp/src/arrow/array/builder_primitive.h | 9 ++ .../arrow/compute/kernels/scalar_if_else.cc | 30 +++++- .../compute/kernels/scalar_if_else_test.cc | 92 ++++++++++++++++++- 6 files changed, 146 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index 6822dc89903..fd1be179816 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -60,6 +60,14 @@ Status FixedSizeBinaryBuilder::AppendValues(const uint8_t* data, int64_t length, return byte_builder_.Append(data, length * byte_width_); } +Status FixedSizeBinaryBuilder::AppendValues(const uint8_t* data, int64_t length, + const uint8_t* validity, + int64_t bitmap_offset) { + RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(validity, bitmap_offset, length); + return byte_builder_.Append(data, length * byte_width_); +} + Status FixedSizeBinaryBuilder::AppendNull() { RETURN_NOT_OK(Reserve(1)); UnsafeAppendNull(); diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 7653eeca5c4..1a6b254a7d3 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -486,6 +486,9 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* valid_bytes = NULLPTR); + Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* validity, + int64_t bitmap_offset); + Status AppendNull() final; Status AppendNulls(int64_t length) final; diff --git a/cpp/src/arrow/array/builder_primitive.cc b/cpp/src/arrow/array/builder_primitive.cc index e403c42411d..769c2f7d07b 100644 --- a/cpp/src/arrow/array/builder_primitive.cc +++ b/cpp/src/arrow/array/builder_primitive.cc @@ -85,6 +85,14 @@ Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, return Status::OK(); } +Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, + const uint8_t* validity, int64_t offset) { + RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(values, offset, length); + ArrayBuilder::UnsafeAppendToBitmap(validity, offset, length); + return Status::OK(); +} + Status BooleanBuilder::AppendValues(const uint8_t* values, int64_t length, const std::vector& is_valid) { RETURN_NOT_OK(Reserve(length)); diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index 2643006b072..856904e1894 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -378,6 +378,15 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { Status AppendValues(const uint8_t* values, int64_t length, const uint8_t* valid_bytes = NULLPTR); + /// \brief Append a sequence of elements in one shot + /// \param[in] values a bitmap of values + /// \param[in] length the number of values to append + /// \param[in] validity a validity bitmap to copy (may be null) + /// \param[in] offset an offset into the values and validity bitmaps + /// \return Status + Status AppendValues(const uint8_t* values, int64_t length, const uint8_t* validity, + int64_t offset); + /// \brief Append a sequence of elements in one shot /// \param[in] values a contiguous C array of values /// \param[in] length the number of values to append diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index e1896e591c9..fcd392ceca4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -1600,7 +1601,8 @@ static Status GetValueAppenders(const DataType& type, ArrayAppenderFunc* array_a struct GetAppenders { template - enable_if_number Visit(const T&) { + enable_if_t::value && !is_boolean_type::value, Status> Visit( + const T&) { using BuilderType = typename TypeTraits::BuilderType; using c_type = typename T::c_type; array_appender = [](ArrayBuilder* raw_builder, @@ -1613,6 +1615,31 @@ struct GetAppenders { return Status::OK(); } + Status Visit(const BooleanType&) { + array_appender = [](ArrayBuilder* raw_builder, + const std::shared_ptr& array, const int64_t offset, + const int64_t length) { + return checked_cast(raw_builder) + ->AppendValues(array->GetValues(1, 0), length, + array->GetValues(0, 0), array->offset + offset); + }; + return Status::OK(); + } + + template + enable_if_fixed_size_binary Visit(const T& ty) { + const int32_t width = static_cast(ty).byte_width(); + array_appender = [=](ArrayBuilder* raw_builder, + const std::shared_ptr& array, const int64_t offset, + const int64_t length) { + return checked_cast(raw_builder) + ->AppendValues( + array->GetValues(1, 0) + ((array->offset + offset) * width), + length, array->GetValues(0, 0), array->offset + offset); + }; + return Status::OK(); + } + template enable_if_base_binary Visit(const T&) { using BuilderType = typename TypeTraits::BuilderType; @@ -2516,7 +2543,6 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { AddCaseWhenKernel(func, Type::LARGE_LIST, CaseWhenFunctor::Exec); AddCaseWhenKernel(func, Type::MAP, CaseWhenFunctor::Exec); AddCaseWhenKernel(func, Type::STRUCT, CaseWhenFunctor::Exec); - // TODO: union, dictionary, extension DCHECK_OK(registry->AddFunction(std::move(func))); } { diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index 573c03f2d1b..4ed4f5fb5f8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -607,6 +607,23 @@ TYPED_TEST(TestCaseWhenNumeric, FixedSize) { {Datum(*MakeArrayOfNull(struct_({field("", boolean())}), 4)), Datum(values1)})); } +TYPED_TEST(TestCaseWhenNumeric, ListOfType) { + // More minimal test to check type coverage + auto type = list(default_type_instance()); + auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); + auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); + auto values_null = ArrayFromJSON(type, "[null, null, null, null]"); + auto values1 = ArrayFromJSON(type, R"([[1, 2], null, [3, 4, 5], [6, null]])"); + auto values2 = ArrayFromJSON(type, R"([[8, 9, 10], [11], null, [12]])"); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + ArrayFromJSON(type, R"([[1, 2], null, null, null])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, + ArrayFromJSON(type, R"([[1, 2], null, null, [6, null]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + ArrayFromJSON(type, R"([null, null, null, [6, null]])")); +} + TEST(TestCaseWhen, Null) { auto cond_true = ScalarFromJSON(boolean(), "true"); auto cond_false = ScalarFromJSON(boolean(), "false"); @@ -977,8 +994,24 @@ TYPED_TEST(TestCaseWhenList, ListOfString) { ArrayFromJSON(type, R"([null, null, null, ["ef", "g"]])")); } +// More minimal tests to check type coverage +TYPED_TEST(TestCaseWhenList, ListOfBool) { + auto type = std::make_shared(boolean()); + auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); + auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); + auto values_null = ArrayFromJSON(type, "[null, null, null, null]"); + auto values1 = ArrayFromJSON(type, R"([[true], null, [false], [false, null]])"); + auto values2 = ArrayFromJSON(type, R"([[false], [false], null, [true]])"); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + ArrayFromJSON(type, R"([[true], null, null, null])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, + ArrayFromJSON(type, R"([[true], null, null, [false, null]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + ArrayFromJSON(type, R"([null, null, null, [false, null]])")); +} + TYPED_TEST(TestCaseWhenList, ListOfInt) { - // More minimal test to check type coverage auto type = std::make_shared(int64()); auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); @@ -994,8 +1027,63 @@ TYPED_TEST(TestCaseWhenList, ListOfInt) { ArrayFromJSON(type, R"([null, null, null, [6, null]])")); } +TYPED_TEST(TestCaseWhenList, ListOfDayTimeInterval) { + auto type = std::make_shared(day_time_interval()); + auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); + auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); + auto values_null = ArrayFromJSON(type, "[null, null, null, null]"); + auto values1 = + ArrayFromJSON(type, R"([[[1, 2]], null, [[3, 4], [5, 0]], [[6, 7], null]])"); + auto values2 = ArrayFromJSON(type, R"([[[8, 9], null], [[11, 12]], null, [[12, 1]]])"); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + ArrayFromJSON(type, R"([[[1, 2]], null, null, null])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, + ArrayFromJSON(type, R"([[[1, 2]], null, null, [[6, 7], null]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + ArrayFromJSON(type, R"([null, null, null, [[6, 7], null]])")); +} + +TYPED_TEST(TestCaseWhenList, ListOfDecimal) { + for (const auto& decimal_ty : + std::vector>{decimal128(3, 2), decimal256(3, 2)}) { + auto type = std::make_shared(decimal_ty); + auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); + auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); + auto values_null = ArrayFromJSON(type, "[null, null, null, null]"); + auto values1 = ArrayFromJSON( + type, R"([["1.23", "2.34"], null, ["3.45", "4.56", "5.67"], ["6.78", null]])"); + auto values2 = + ArrayFromJSON(type, R"([["8.90", "9.01", "1.02"], ["1.12"], null, ["1.23"]])"); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + ArrayFromJSON(type, R"([["1.23", "2.34"], null, null, null])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, + ArrayFromJSON(type, R"([["1.23", "2.34"], null, null, ["6.78", null]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + ArrayFromJSON(type, R"([null, null, null, ["6.78", null]])")); + } +} + +TYPED_TEST(TestCaseWhenList, ListOfFixedSizeBinary) { + auto type = std::make_shared(fixed_size_binary(4)); + auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); + auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); + auto values_null = ArrayFromJSON(type, "[null, null, null, null]"); + auto values1 = ArrayFromJSON( + type, R"([["1.23", "2.34"], null, ["3.45", "4.56", "5.67"], ["6.78", null]])"); + auto values2 = + ArrayFromJSON(type, R"([["8.90", "9.01", "1.02"], ["1.12"], null, ["1.23"]])"); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + ArrayFromJSON(type, R"([["1.23", "2.34"], null, null, null])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, + ArrayFromJSON(type, R"([["1.23", "2.34"], null, null, ["6.78", null]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + ArrayFromJSON(type, R"([null, null, null, ["6.78", null]])")); +} + TYPED_TEST(TestCaseWhenList, ListOfListOfInt) { - // More minimal test to check type coverage auto type = std::make_shared(list(int64())); auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); From a7a3b453d6deb04fa36301a933346a493a456592 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 26 Jul 2021 16:35:29 -0400 Subject: [PATCH 08/31] ARROW-13222: [C++] Fix typos --- cpp/src/arrow/compute/kernels/scalar_if_else.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index fcd392ceca4..0ec151bbe43 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1468,7 +1468,7 @@ static Status ExecVarWidthArrayCaseWhen(KernelContext* ctx, const ExecBatch& bat std::unique_ptr raw_builder; RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), out->type(), &raw_builder)); RETURN_NOT_OK(raw_builder->Reserve(batch.length)); - reserve_data(raw_builder.get()); + RETURN_NOT_OK(reserve_data(raw_builder.get())); for (int64_t row = 0; row < batch.length; row++) { int64_t selected = have_else_arg ? batch.values.size() - 1 : -1; @@ -1653,8 +1653,8 @@ struct GetAppenders { auto data = array->GetValues(2, 0); for (int64_t i = 0; i < length; i++) { if (!bitmap || BitUtil::GetBit(bitmap, offset + i)) { - const int32_t start = offsets[offset + i]; - const int32_t end = offsets[offset + i + 1]; + const offset_type start = offsets[offset + i]; + const offset_type end = offsets[offset + i + 1]; RETURN_NOT_OK(builder->Append(data + start, end - start)); } else { RETURN_NOT_OK(builder->AppendNull()); From d6a50ea3041bf138c2783fb0de7311eced8e9a5e Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 26 Jul 2021 16:52:18 -0400 Subject: [PATCH 09/31] ARROW-13222: [C++] Fix types --- cpp/src/arrow/compute/kernels/scalar_if_else.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index 0ec151bbe43..1e0cd990b3a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1880,7 +1880,7 @@ struct CaseWhenFunctor { return ExecVarWidthArrayCaseWhen( ctx, batch, out, // ReserveData - [](ArrayBuilder*) {}, + [](ArrayBuilder*) { return Status::OK(); }, // AppendScalar [](ArrayBuilder* raw_builder, const Scalar& scalar) { return raw_builder->AppendScalar(scalar); @@ -1919,7 +1919,7 @@ struct CaseWhenFunctor { return ExecVarWidthArrayCaseWhen( ctx, batch, out, // ReserveData - [](ArrayBuilder*) {}, + [](ArrayBuilder*) { return Status::OK(); }, // AppendScalar [](ArrayBuilder* raw_builder, const Scalar& scalar) { return raw_builder->AppendScalar(scalar); From 1f47c6dc0d749e807b28659b2e73b14f3664416a Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 26 Jul 2021 17:22:17 -0400 Subject: [PATCH 10/31] ARROW-13222: [C++] casts for msyc --- cpp/src/arrow/compute/kernels/scalar_if_else.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index 1e0cd990b3a..bf01afa1d80 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1580,7 +1580,8 @@ struct CaseWhenFunctor> { [](ArrayBuilder* raw_builder, const Scalar& raw_scalar) { const auto& scalar = checked_cast(raw_scalar); return checked_cast(raw_builder) - ->Append(scalar.value->data(), scalar.value->size()); + ->Append(scalar.value->data(), + static_cast(scalar.value->size())); }, // AppendArray [](ArrayBuilder* raw_builder, const std::shared_ptr& array, From a4336f85b2991b1690d73e961f75dd3e6dc844fe Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 27 Jul 2021 14:10:09 -0400 Subject: [PATCH 11/31] ARROW-13222: [C++] Explicit cast to avoid accidental overflow --- cpp/src/arrow/compute/kernels/scalar_if_else.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index bf01afa1d80..aeff2abebbd 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1471,7 +1471,7 @@ static Status ExecVarWidthArrayCaseWhen(KernelContext* ctx, const ExecBatch& bat RETURN_NOT_OK(reserve_data(raw_builder.get())); for (int64_t row = 0; row < batch.length; row++) { - int64_t selected = have_else_arg ? batch.values.size() - 1 : -1; + int64_t selected = have_else_arg ? static_cast(batch.values.size() - 1) : -1; for (int64_t arg = 0; static_cast(arg) < conds_array.child_data.size(); arg++) { const ArrayData& cond_array = *conds_array.child_data[arg]; From 2bde163d479c6e67f7cf0f3fe6202e1fc3db47b3 Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 3 Aug 2021 14:22:31 -0400 Subject: [PATCH 12/31] ARROW-13222: [C++] Add support for unions --- cpp/src/arrow/array/array_test.cc | 16 ++++ cpp/src/arrow/array/util.cc | 23 +++-- .../arrow/compute/kernels/scalar_if_else.cc | 93 ++++++++++++++++++- .../compute/kernels/scalar_if_else_test.cc | 62 +++++++++++++ 4 files changed, 187 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 5a532e17519..d417f29f58f 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -377,6 +377,7 @@ TEST_F(TestArray, TestMakeArrayOfNullUnion) { const int64_t union_length = 10; auto s_union_ty = sparse_union({field("a", utf8()), field("b", int32())}, {0, 1}); ASSERT_OK_AND_ASSIGN(auto s_union_nulls, MakeArrayOfNull(s_union_ty, union_length)); + ASSERT_OK(s_union_nulls->ValidateFull()); ASSERT_EQ(s_union_nulls->null_count(), 0); { const auto& typed_union = checked_cast(*s_union_nulls); @@ -388,8 +389,23 @@ TEST_F(TestArray, TestMakeArrayOfNullUnion) { } } + s_union_ty = sparse_union({field("a", utf8()), field("b", int32())}, {2, 7}); + ASSERT_OK_AND_ASSIGN(s_union_nulls, MakeArrayOfNull(s_union_ty, union_length)); + ASSERT_OK(s_union_nulls->ValidateFull()); + ASSERT_EQ(s_union_nulls->null_count(), 0); + { + const auto& typed_union = checked_cast(*s_union_nulls); + ASSERT_EQ(typed_union.field(0)->null_count(), union_length); + + // Check type codes are all 2 + for (int i = 0; i < union_length; ++i) { + ASSERT_EQ(typed_union.raw_type_codes()[i], 2); + } + } + auto d_union_ty = dense_union({field("a", utf8()), field("b", int32())}, {0, 1}); ASSERT_OK_AND_ASSIGN(auto d_union_nulls, MakeArrayOfNull(d_union_ty, union_length)); + ASSERT_OK(d_union_nulls->ValidateFull()); ASSERT_EQ(d_union_nulls->null_count(), 0); { const auto& typed_union = checked_cast(*d_union_nulls); diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index fae379e51f4..b8b47788fe7 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -442,9 +442,18 @@ class NullArrayFactory { // First buffer is always null out_->buffers[0] = nullptr; - // Type codes are all zero, so we can use buffer_ which has had it's memory - // zeroed out_->buffers[1] = buffer_; + // buffer_ is zeroed, but 0 may not be a valid type code + if (type.type_codes()[0] != 0) { + std::memset(buffer_->mutable_data(), type.type_codes()[0], buffer_->size()); + } + + // We can't reuse the buffer unless it's zeroed + std::shared_ptr child_buffer = buffer_; + if (type.type_codes()[0] != 0) { + ARROW_ASSIGN_OR_RAISE(child_buffer, AllocateBuffer(buffer_->size(), pool_)); + std::memset(child_buffer->mutable_data(), 0, child_buffer->size()); + } // For sparse unions, we now create children with the same length as the // parent @@ -453,12 +462,13 @@ class NullArrayFactory { // For dense unions, we set the offsets to all zero and create children // with length 1 out_->buffers.resize(3); - out_->buffers[2] = buffer_; + out_->buffers[2] = child_buffer; child_length = 1; } for (int i = 0; i < type_->num_fields(); ++i) { - ARROW_ASSIGN_OR_RAISE(out_->child_data[i], CreateChild(i, child_length)); + ARROW_ASSIGN_OR_RAISE(out_->child_data[i], + CreateChild(i, child_length, child_buffer)); } return Status::OK(); } @@ -474,9 +484,10 @@ class NullArrayFactory { return Status::NotImplemented("construction of all-null ", type); } - Result> CreateChild(int i, int64_t length) { + Result> CreateChild( + int i, int64_t length, const std::shared_ptr& buffer = nullptr) { NullArrayFactory child_factory(pool_, type_->field(i)->type(), length); - child_factory.buffer_ = buffer_; + child_factory.buffer_ = buffer ? buffer : buffer_; return child_factory.Create(); } diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index aeff2abebbd..29fc9dc2a9d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1778,6 +1779,55 @@ struct GetAppenders { return Status::OK(); } + Status Visit(const DenseUnionType& ty) { + const auto& union_ty = checked_cast(ty); + std::vector appenders(union_ty.num_fields()); + for (int i = 0; static_cast(i) < appenders.size(); i++) { + RETURN_NOT_OK(GetValueAppenders(*union_ty.field(i)->type(), &appenders[i])); + } + array_appender = [=](ArrayBuilder* raw_builder, + const std::shared_ptr& array, const int64_t offset, + const int64_t length) { + auto builder = checked_cast(raw_builder); + const int8_t* type_codes = array->GetValues(1); + for (int64_t row = offset; row < offset + length; row++) { + const int8_t type_code = type_codes[row]; + const int child_id = + checked_cast(*builder->type()).child_ids()[type_code]; + const int32_t union_offset = array->GetValues(2)[row]; + RETURN_NOT_OK(builder->Append(type_code)); + RETURN_NOT_OK(appenders[child_id](builder->child_builder(child_id).get(), + array->child_data[child_id], union_offset, + /*length=*/1)); + } + return Status::OK(); + }; + return Status::OK(); + } + + Status Visit(const SparseUnionType& ty) { + const auto& union_ty = checked_cast(ty); + std::vector appenders(union_ty.num_fields()); + for (int i = 0; static_cast(i) < appenders.size(); i++) { + RETURN_NOT_OK(GetValueAppenders(*union_ty.field(i)->type(), &appenders[i])); + } + array_appender = [=](ArrayBuilder* raw_builder, + const std::shared_ptr& array, const int64_t offset, + const int64_t length) { + auto builder = checked_cast(raw_builder); + for (int i = 0; static_cast(i) < appenders.size(); i++) { + RETURN_NOT_OK(appenders[i](builder->child_builder(i).get(), array->child_data[i], + array->offset + offset, length)); + } + const int8_t* type_codes = array->GetValues(1); + for (int64_t row = offset; row < offset + length; row++) { + RETURN_NOT_OK(builder->Append(type_codes[row])); + } + return Status::OK(); + }; + return Status::OK(); + } + Status Visit(const DataType& ty) { return Status::NotImplemented("Appender for type ", ty); } @@ -1788,7 +1838,7 @@ struct GetAppenders { }; static Status GetValueAppenders(const DataType& type, ArrayAppenderFunc* array_appender) { - // TODO: should cover scalars too + // We could extend this to cover scalars too, avoiding a type check in AppendScalar GetAppenders get_appenders; RETURN_NOT_OK(VisitTypeInline(type, &get_appenders)); *array_appender = std::move(get_appenders.array_appender); @@ -1979,6 +2029,45 @@ struct CaseWhenFunctor { } }; +template +struct CaseWhenFunctor> { + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + if (batch[0].null_count() > 0) { + return Status::Invalid("cond struct must not have outer nulls"); + } + if (batch[0].is_scalar()) { + return ExecScalar(ctx, batch, out); + } + return ExecArray(ctx, batch, out); + } + + static Status ExecScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + return ExecVarWidthScalarCaseWhen( + ctx, batch, out, + [](KernelContext* ctx, const ArrayData& source, ArrayData* output) { + *output = source; + return Status::OK(); + }); + } + static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + ArrayAppenderFunc array_appender; + RETURN_NOT_OK(GetValueAppenders(*out->type(), &array_appender)); + return ExecVarWidthArrayCaseWhen( + ctx, batch, out, + // ReserveData + [&](ArrayBuilder* raw_builder) { return Status::OK(); }, + // AppendScalar + [](ArrayBuilder* raw_builder, const Scalar& scalar) { + return raw_builder->AppendScalar(scalar); + }, + // AppendArray + [&](ArrayBuilder* raw_builder, const std::shared_ptr& array, + const int64_t row) { + return array_appender(raw_builder, array, row, /*length=*/1); + }); + } +}; + struct CoalesceFunction : ScalarFunction { using ScalarFunction::ScalarFunction; @@ -2544,6 +2633,8 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { AddCaseWhenKernel(func, Type::LARGE_LIST, CaseWhenFunctor::Exec); AddCaseWhenKernel(func, Type::MAP, CaseWhenFunctor::Exec); AddCaseWhenKernel(func, Type::STRUCT, CaseWhenFunctor::Exec); + AddCaseWhenKernel(func, Type::DENSE_UNION, CaseWhenFunctor::Exec); + AddCaseWhenKernel(func, Type::SPARSE_UNION, CaseWhenFunctor::Exec); DCHECK_OK(registry->AddFunction(std::move(func))); } { diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index 4ed4f5fb5f8..b3b0f26cead 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -1405,6 +1405,68 @@ TEST(TestCaseWhen, StructOfListOfInt) { ArrayFromJSON(type, R"([null, null, null, [null, [null, 1]]])")); } +TEST(TestCaseWhen, UnionBoolString) { + for (const auto& type : std::vector>{ + sparse_union({field("a", boolean()), field("b", utf8())}, {2, 7}), + dense_union({field("a", boolean()), field("b", utf8())}, {2, 7})}) { + ARROW_SCOPED_TRACE(type->ToString()); + auto cond_true = ScalarFromJSON(boolean(), "true"); + auto cond_false = ScalarFromJSON(boolean(), "false"); + auto cond_null = ScalarFromJSON(boolean(), "null"); + auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]"); + auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]"); + auto scalar_null = ScalarFromJSON(type, "null"); + auto scalar1 = ScalarFromJSON(type, R"([2, null])"); + auto scalar2 = ScalarFromJSON(type, R"([7, "foo"])"); + auto values_null = ArrayFromJSON(type, "[null, null, null, null]"); + auto values1 = ArrayFromJSON(type, R"([[2, true], null, [7, "bar"], [7, "baz"]])"); + auto values2 = ArrayFromJSON(type, R"([[7, "spam"], [2, null], null, [7, null]])"); + + CheckScalar("case_when", {MakeStruct({}), values1}, values1); + CheckScalar("case_when", {MakeStruct({}), values_null}, values_null); + + CheckScalar("case_when", {MakeStruct({cond_true}), scalar1, values1}, + *MakeArrayFromScalar(*scalar1, 4)); + CheckScalar("case_when", {MakeStruct({cond_false}), scalar1, values1}, values1); + + CheckScalar("case_when", {MakeStruct({cond_true}), values1}, values1); + CheckScalar("case_when", {MakeStruct({cond_false}), values1}, values_null); + CheckScalar("case_when", {MakeStruct({cond_null}), values1}, values_null); + CheckScalar("case_when", {MakeStruct({cond_true}), values1, values2}, values1); + CheckScalar("case_when", {MakeStruct({cond_false}), values1, values2}, values2); + CheckScalar("case_when", {MakeStruct({cond_null}), values1, values2}, values2); + + CheckScalar("case_when", {MakeStruct({cond_true, cond_true}), values1, values2}, + values1); + CheckScalar("case_when", {MakeStruct({cond_false, cond_false}), values1, values2}, + values_null); + CheckScalar("case_when", {MakeStruct({cond_true, cond_false}), values1, values2}, + values1); + CheckScalar("case_when", {MakeStruct({cond_false, cond_true}), values1, values2}, + values2); + CheckScalar("case_when", {MakeStruct({cond_null, cond_true}), values1, values2}, + values2); + CheckScalar("case_when", + {MakeStruct({cond_false, cond_false}), values1, values2, values2}, + values2); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2}, + ArrayFromJSON(type, R"([[2, null], [2, null], [7, "foo"], null])")); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null}, values_null); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null, scalar1}, + ArrayFromJSON(type, R"([null, null, [2, null], [2, null]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2, scalar1}, + ArrayFromJSON(type, R"([[2, null], [2, null], [7, "foo"], [2, null]])")); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + ArrayFromJSON(type, R"([[2, true], null, null, null])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, + ArrayFromJSON(type, R"([[2, true], null, null, [7, "baz"]])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + ArrayFromJSON(type, R"([null, null, null, [7, "baz"]])")); + } +} + TEST(TestCaseWhen, DispatchBest) { CheckDispatchBest("case_when", {struct_({field("", boolean())}), int64(), int32()}, {struct_({field("", boolean())}), int64(), int64()}); From 5307c5a1a5543dbfc9aa92ea89e1be348c8cb7bd Mon Sep 17 00:00:00 2001 From: David Li Date: Tue, 3 Aug 2021 14:27:51 -0400 Subject: [PATCH 13/31] ARROW-13222: [C++] Trim unnecessary cases --- .../arrow/compute/kernels/scalar_if_else.cc | 85 ++----------------- 1 file changed, 8 insertions(+), 77 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index 29fc9dc2a9d..6fa127bfab0 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1417,9 +1417,8 @@ struct CaseWhenFunctor { } }; -template static Status ExecVarWidthScalarCaseWhen(KernelContext* ctx, const ExecBatch& batch, - Datum* out, CopyArray copy_array) { + Datum* out) { const auto& conds = checked_cast(*batch.values[0].scalar()); Datum result; for (size_t i = 0; i < batch.values.size() - 1; i++) { @@ -1450,9 +1449,7 @@ static Status ExecVarWidthScalarCaseWhen(KernelContext* ctx, const ExecBatch& ba ctx->memory_pool())); *output = *array->data(); } else { - // Copy offsets/data - const ArrayData& source = *result.array(); - RETURN_NOT_OK(copy_array(ctx, source, output)); + *output = *result.array(); } return Status::OK(); } @@ -1522,37 +1519,11 @@ struct CaseWhenFunctor> { return Status::Invalid("cond struct must not have outer nulls"); } if (batch[0].is_scalar()) { - return ExecScalar(ctx, batch, out); + return ExecVarWidthScalarCaseWhen(ctx, batch, out); } return ExecArray(ctx, batch, out); } - static Status ExecScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - return ExecVarWidthScalarCaseWhen( - ctx, batch, out, - [](KernelContext* ctx, const ArrayData& source, ArrayData* output) { - output->length = source.length; - output->SetNullCount(source.null_count); - if (source.MayHaveNulls()) { - ARROW_ASSIGN_OR_RAISE( - output->buffers[0], - arrow::internal::CopyBitmap(ctx->memory_pool(), source.buffers[0]->data(), - source.offset, source.length)); - } - ARROW_ASSIGN_OR_RAISE(output->buffers[1], - ctx->Allocate(sizeof(offset_type) * (source.length + 1))); - const offset_type* in_offsets = source.GetValues(1); - offset_type* out_offsets = output->GetMutableValues(1); - std::transform(in_offsets, in_offsets + source.length + 1, out_offsets, - [&](offset_type offset) { return offset - in_offsets[0]; }); - auto data_length = out_offsets[output->length] - out_offsets[0]; - ARROW_ASSIGN_OR_RAISE(output->buffers[2], ctx->Allocate(data_length)); - std::memcpy(output->buffers[2]->mutable_data(), - source.buffers[2]->data() + in_offsets[0], data_length); - return Status::OK(); - }); - } - static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { return ExecVarWidthArrayCaseWhen( ctx, batch, out, @@ -1854,19 +1825,11 @@ struct CaseWhenFunctor> { return Status::Invalid("cond struct must not have outer nulls"); } if (batch[0].is_scalar()) { - return ExecScalar(ctx, batch, out); + return ExecVarWidthScalarCaseWhen(ctx, batch, out); } return ExecArray(ctx, batch, out); } - static Status ExecScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - return ExecVarWidthScalarCaseWhen( - ctx, batch, out, - [](KernelContext* ctx, const ArrayData& source, ArrayData* output) { - *output = source; - return Status::OK(); - }); - } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& ty = checked_cast(*out->type()); ArrayAppenderFunc array_appender; @@ -1912,19 +1875,11 @@ struct CaseWhenFunctor { return Status::Invalid("cond struct must not have outer nulls"); } if (batch[0].is_scalar()) { - return ExecScalar(ctx, batch, out); + return ExecVarWidthScalarCaseWhen(ctx, batch, out); } return ExecArray(ctx, batch, out); } - static Status ExecScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - return ExecVarWidthScalarCaseWhen( - ctx, batch, out, - [](KernelContext* ctx, const ArrayData& source, ArrayData* output) { - *output = source; - return Status::OK(); - }); - } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { ArrayAppenderFunc array_appender; RETURN_NOT_OK(GetValueAppenders(*out->type(), &array_appender)); @@ -1951,19 +1906,11 @@ struct CaseWhenFunctor { return Status::Invalid("cond struct must not have outer nulls"); } if (batch[0].is_scalar()) { - return ExecScalar(ctx, batch, out); + return ExecVarWidthScalarCaseWhen(ctx, batch, out); } return ExecArray(ctx, batch, out); } - static Status ExecScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - return ExecVarWidthScalarCaseWhen( - ctx, batch, out, - [](KernelContext* ctx, const ArrayData& source, ArrayData* output) { - *output = source; - return Status::OK(); - }); - } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { ArrayAppenderFunc array_appender; RETURN_NOT_OK(GetValueAppenders(*out->type(), &array_appender)); @@ -1990,19 +1937,11 @@ struct CaseWhenFunctor { return Status::Invalid("cond struct must not have outer nulls"); } if (batch[0].is_scalar()) { - return ExecScalar(ctx, batch, out); + return ExecVarWidthScalarCaseWhen(ctx, batch, out); } return ExecArray(ctx, batch, out); } - static Status ExecScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - return ExecVarWidthScalarCaseWhen( - ctx, batch, out, - [](KernelContext* ctx, const ArrayData& source, ArrayData* output) { - *output = source; - return Status::OK(); - }); - } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& ty = checked_cast(*out->type()); const int64_t width = ty.list_size(); @@ -2036,19 +1975,11 @@ struct CaseWhenFunctor> { return Status::Invalid("cond struct must not have outer nulls"); } if (batch[0].is_scalar()) { - return ExecScalar(ctx, batch, out); + return ExecVarWidthScalarCaseWhen(ctx, batch, out); } return ExecArray(ctx, batch, out); } - static Status ExecScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - return ExecVarWidthScalarCaseWhen( - ctx, batch, out, - [](KernelContext* ctx, const ArrayData& source, ArrayData* output) { - *output = source; - return Status::OK(); - }); - } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { ArrayAppenderFunc array_appender; RETURN_NOT_OK(GetValueAppenders(*out->type(), &array_appender)); From 72811bbff07526d9fd9593fedfc7afe480a60acf Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 18 Aug 2021 16:35:29 -0400 Subject: [PATCH 14/31] ARROW-13222: [C++] Refactor GetValueAppender into builder methods --- cpp/src/arrow/array/builder_base.h | 17 + cpp/src/arrow/array/builder_binary.h | 24 ++ cpp/src/arrow/array/builder_nested.h | 64 ++++ cpp/src/arrow/array/builder_primitive.h | 16 + cpp/src/arrow/array/builder_union.cc | 4 + cpp/src/arrow/array/builder_union.h | 27 ++ .../arrow/compute/kernels/scalar_if_else.cc | 302 +----------------- .../compute/kernels/scalar_if_else_test.cc | 79 ++--- 8 files changed, 196 insertions(+), 337 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index b52a6b668ec..b05a104bdc4 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -123,6 +123,23 @@ class ARROW_EXPORT ArrayBuilder { Status AppendScalar(const Scalar& scalar, int64_t n_repeats); Status AppendScalars(const ScalarVector& scalars); + /// \brief Append a range of values from an array + Status AppendArraySlice(const ArrayData& array, const int64_t offset, + const int64_t length) { + if (!type()->Equals(*array.type)) { + // TODO: test this + return Status::TypeError("Expected array of type ", *type(), + " but got array of type ", *array.type); + } + return AppendArraySliceUnchecked(array, offset, length); + } + /// \brief Append a range of values from an array without checking type compatibility + virtual Status AppendArraySliceUnchecked(const ArrayData& array, const int64_t offset, + const int64_t length) { + return Status::NotImplemented("AppendArraySliceUnchecked for builder for ", *type()); + } + // TODO: overloads for arrays + /// For cases where raw data was memcpy'd into the internal buffers, allows us /// to advance the length of the builder. It is your responsibility to use /// this function responsibly. diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 1a6b254a7d3..ec12aa8e6b2 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -274,6 +274,23 @@ class BaseBinaryBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, + int64_t length) override { + auto bitmap = array.GetValues(0, 0); + auto offsets = array.GetValues(1); + auto data = array.GetValues(2, 0); + for (int64_t i = 0; i < length; i++) { + if (!bitmap || BitUtil::GetBit(bitmap, array.offset + offset + i)) { + const offset_type start = offsets[offset + i]; + const offset_type end = offsets[offset + i + 1]; + RETURN_NOT_OK(Append(data + start, end - start)); + } else { + RETURN_NOT_OK(AppendNull()); + } + } + return Status::OK(); + } + void Reset() override { ArrayBuilder::Reset(); offsets_builder_.Reset(); @@ -495,6 +512,13 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status AppendEmptyValue() final; Status AppendEmptyValues(int64_t length) final; + Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, + int64_t length) override { + return AppendValues( + array.GetValues(1, 0) + ((array.offset + offset) * byte_width_), length, + array.GetValues(0, 0), array.offset + offset); + } + void UnsafeAppend(const uint8_t* value) { UnsafeAppendToBitmap(true); if (ARROW_PREDICT_TRUE(byte_width_ > 0)) { diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 12b999b786e..57c460be272 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -122,6 +122,23 @@ class BaseListBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, + int64_t length) override { + const offset_type* offsets = array.GetValues(1); + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : nullptr; + for (int64_t row = offset; row < offset + length; row++) { + if (!validity || BitUtil::GetBit(validity, array.offset + row)) { + RETURN_NOT_OK(Append()); + int64_t slot_length = offsets[row + 1] - offsets[row]; + value_builder_->AppendArraySliceUnchecked(*array.child_data[0], offsets[row], + slot_length); + } else { + RETURN_NOT_OK(AppendNull()); + } + } + return Status::OK(); + } + Status FinishInternal(std::shared_ptr* out) override { ARROW_RETURN_NOT_OK(AppendNextOffset()); @@ -275,6 +292,25 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { Status AppendEmptyValues(int64_t length) final; + Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, + int64_t length) override { + const int32_t* offsets = array.GetValues(1); + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : nullptr; + for (int64_t row = offset; row < offset + length; row++) { + if (!validity || BitUtil::GetBit(validity, array.offset + row)) { + RETURN_NOT_OK(Append()); + const int64_t slot_length = offsets[row + 1] - offsets[row]; + key_builder_->AppendArraySliceUnchecked(*array.child_data[0]->child_data[0], + offsets[row], slot_length); + item_builder_->AppendArraySliceUnchecked(*array.child_data[0]->child_data[1], + offsets[row], slot_length); + } else { + RETURN_NOT_OK(AppendNull()); + } + } + return Status::OK(); + } + /// \brief Get builder to append keys. /// /// Append a key with this builder should be followed by appending @@ -374,6 +410,21 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { Status AppendEmptyValues(int64_t length) final; + Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, + int64_t length) final { + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : nullptr; + for (int64_t row = offset; row < offset + length; row++) { + if (!validity || BitUtil::GetBit(validity, array.offset + row)) { + RETURN_NOT_OK(value_builder_->AppendArraySliceUnchecked( + *array.child_data[0], list_size_ * (array.offset + row), list_size_)); + RETURN_NOT_OK(Append()); + } else { + RETURN_NOT_OK(AppendNull()); + } + } + return Status::OK(); + } + ArrayBuilder* value_builder() const { return value_builder_.get(); } std::shared_ptr type() const override { @@ -467,6 +518,19 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, + int64_t length) override { + for (int i = 0; static_cast(i) < children_.size(); i++) { + children_[i]->AppendArraySliceUnchecked(*array.child_data[i], array.offset + offset, + length); + } + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : nullptr; + for (int64_t row = offset; row < offset + length; row++) { + RETURN_NOT_OK(Append(!validity || BitUtil::GetBit(validity, array.offset + row))); + } + return Status::OK(); + } + void Reset() override; ArrayBuilder* field_builder(int i) const { return children_[i].get(); } diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index 856904e1894..4b5d9f58b5e 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -53,6 +53,10 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { Status Append(std::nullptr_t) { return AppendNull(); } + Status AppendArraySliceUnchecked(const ArrayData&, int64_t, int64_t length) override { + return AppendNulls(length); + } + Status FinishInternal(std::shared_ptr* out) override; /// \cond FALSE @@ -271,6 +275,12 @@ class NumericBuilder : public ArrayBuilder { return Status::OK(); } + Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, + int64_t length) override { + return AppendValues(array.GetValues(1) + offset, length, + array.GetValues(0, 0), array.offset + offset); + } + /// Append a single scalar under the assumption that the underlying Buffer is /// large enough. /// @@ -483,6 +493,12 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { Status AppendValues(int64_t length, bool value); + Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, + int64_t length) override { + return AppendValues(array.GetValues(1, 0), length, + array.GetValues(0, 0), array.offset + offset); + } + Status FinishInternal(std::shared_ptr* out) override; /// \cond FALSE diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index 8617cb73fce..e55315f32cf 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -64,6 +64,7 @@ BasicUnionBuilder::BasicUnionBuilder( type_codes_ = union_type.type_codes(); children_ = children; + type_id_to_child_id_.resize(union_type.max_type_code() + 1, -1); type_id_to_children_.resize(union_type.max_type_code() + 1, nullptr); DCHECK_LE( type_id_to_children_.size() - 1, @@ -73,6 +74,7 @@ BasicUnionBuilder::BasicUnionBuilder( child_fields_[i] = union_type.field(static_cast(i)); auto type_id = union_type.type_codes()[i]; + type_id_to_child_id_[type_id] = i; type_id_to_children_[type_id] = children[i].get(); } } @@ -82,6 +84,7 @@ int8_t BasicUnionBuilder::AppendChild(const std::shared_ptr& new_c children_.push_back(new_child); auto new_type_id = NextTypeId(); + type_id_to_child_id_[new_type_id] = static_cast(children_.size() - 1); type_id_to_children_[new_type_id] = new_child.get(); child_fields_.push_back(field(field_name, nullptr)); type_codes_.push_back(static_cast(new_type_id)); @@ -114,6 +117,7 @@ int8_t BasicUnionBuilder::NextTypeId() { static_cast(UnionType::kMaxTypeCode)); // type_id_to_children_ is already densely packed, so just append the new child + type_id_to_child_id_.resize(type_id_to_child_id_.size() + 1); type_id_to_children_.resize(type_id_to_children_.size() + 1); return dense_type_id_++; } diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index 060be474fb8..24ddf76f12c 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -74,6 +74,7 @@ class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder { UnionMode::type mode_; std::vector type_id_to_children_; + std::vector type_id_to_child_id_; // for all type_id < dense_type_id_, type_id_to_children_[type_id] != nullptr int8_t dense_type_id_ = 0; TypedBufferBuilder types_builder_; @@ -155,6 +156,21 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { return offsets_builder_.Append(offset); } + Status AppendArraySliceUnchecked(const ArrayData& array, const int64_t offset, + const int64_t length) override { + const int8_t* type_codes = array.GetValues(1); + const int32_t* offsets = array.GetValues(2); + for (int64_t row = offset; row < offset + length; row++) { + const int8_t type_code = type_codes[row]; + const int child_id = type_id_to_child_id_[type_code]; + const int32_t union_offset = offsets[row]; + RETURN_NOT_OK(Append(type_code)); + RETURN_NOT_OK(type_id_to_children_[type_code]->AppendArraySliceUnchecked( + *array.child_data[child_id], union_offset, /*length=*/1)); + } + return Status::OK(); + } + Status FinishInternal(std::shared_ptr* out) override; private: @@ -230,6 +246,17 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { /// The corresponding child builder must be appended to independently after this method /// is called, and all other child builders must have null or empty value appended. Status Append(int8_t next_type) { return types_builder_.Append(next_type); } + + Status AppendArraySliceUnchecked(const ArrayData& array, const int64_t offset, + const int64_t length) override { + for (size_t i = 0; i < type_codes_.size(); i++) { + type_id_to_children_[type_codes_[i]]->AppendArraySliceUnchecked( + *array.child_data[i], array.offset + offset, length); + } + const int8_t* type_codes = array.GetValues(1); + types_builder_.Append(type_codes + offset, length); + return Status::OK(); + } }; } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index 6fa127bfab0..603ee87d1c5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1454,11 +1454,11 @@ static Status ExecVarWidthScalarCaseWhen(KernelContext* ctx, const ExecBatch& ba return Status::OK(); } -template +// TODO: untemplate on ReserveData to avoid generating so many overloads +template static Status ExecVarWidthArrayCaseWhen(KernelContext* ctx, const ExecBatch& batch, Datum* out, ReserveData reserve_data, - AppendScalar append_scalar, - AppendArray append_array) { + AppendScalar append_scalar) { const auto& conds_array = *batch.values[0].array(); ArrayData* output = out->mutable_array(); const bool have_else_arg = @@ -1498,7 +1498,7 @@ static Status ExecVarWidthArrayCaseWhen(KernelContext* ctx, const ExecBatch& bat const auto& array = source.array(); if (!array->buffers[0] || BitUtil::GetBit(array->buffers[0]->data(), array->offset + row)) { - RETURN_NOT_OK(append_array(raw_builder.get(), array, row)); + RETURN_NOT_OK(raw_builder->AppendArraySliceUnchecked(*array, row, /*length=*/1)); } else { RETURN_NOT_OK(raw_builder->AppendNull()); } @@ -1554,268 +1554,10 @@ struct CaseWhenFunctor> { return checked_cast(raw_builder) ->Append(scalar.value->data(), static_cast(scalar.value->size())); - }, - // AppendArray - [](ArrayBuilder* raw_builder, const std::shared_ptr& array, - const int64_t row) { - const offset_type* offsets = array->GetValues(1); - return checked_cast(raw_builder) - ->Append(array->buffers[2]->data() + offsets[row], - offsets[row + 1] - offsets[row]); }); } }; -// Given an array and a builder, append a slice of the array to the builder -using ArrayAppenderFunc = std::function&, int64_t, int64_t)>; - -static Status GetValueAppenders(const DataType& type, ArrayAppenderFunc* array_appender); - -struct GetAppenders { - template - enable_if_t::value && !is_boolean_type::value, Status> Visit( - const T&) { - using BuilderType = typename TypeTraits::BuilderType; - using c_type = typename T::c_type; - array_appender = [](ArrayBuilder* raw_builder, - const std::shared_ptr& array, const int64_t offset, - const int64_t length) { - return checked_cast(raw_builder) - ->AppendValues(array->GetValues(1) + offset, length, - array->GetValues(0, 0), array->offset + offset); - }; - return Status::OK(); - } - - Status Visit(const BooleanType&) { - array_appender = [](ArrayBuilder* raw_builder, - const std::shared_ptr& array, const int64_t offset, - const int64_t length) { - return checked_cast(raw_builder) - ->AppendValues(array->GetValues(1, 0), length, - array->GetValues(0, 0), array->offset + offset); - }; - return Status::OK(); - } - - template - enable_if_fixed_size_binary Visit(const T& ty) { - const int32_t width = static_cast(ty).byte_width(); - array_appender = [=](ArrayBuilder* raw_builder, - const std::shared_ptr& array, const int64_t offset, - const int64_t length) { - return checked_cast(raw_builder) - ->AppendValues( - array->GetValues(1, 0) + ((array->offset + offset) * width), - length, array->GetValues(0, 0), array->offset + offset); - }; - return Status::OK(); - } - - template - enable_if_base_binary Visit(const T&) { - using BuilderType = typename TypeTraits::BuilderType; - using offset_type = typename T::offset_type; - array_appender = [](ArrayBuilder* raw_builder, - const std::shared_ptr& array, const int64_t offset, - const int64_t length) { - auto builder = checked_cast(raw_builder); - auto bitmap = array->GetValues(0, 0); - auto offsets = array->GetValues(1); - auto data = array->GetValues(2, 0); - for (int64_t i = 0; i < length; i++) { - if (!bitmap || BitUtil::GetBit(bitmap, offset + i)) { - const offset_type start = offsets[offset + i]; - const offset_type end = offsets[offset + i + 1]; - RETURN_NOT_OK(builder->Append(data + start, end - start)); - } else { - RETURN_NOT_OK(builder->AppendNull()); - } - } - return Status::OK(); - }; - return Status::OK(); - } - - template - enable_if_var_size_list Visit(const T& ty) { - using BuilderType = typename TypeTraits::BuilderType; - using offset_type = typename T::offset_type; - const auto& list_ty = checked_cast(ty); - ArrayAppenderFunc sub_appender; - RETURN_NOT_OK(GetValueAppenders(*list_ty.value_type(), &sub_appender)); - array_appender = [=](ArrayBuilder* raw_builder, - const std::shared_ptr& array, const int64_t offset, - const int64_t length) { - auto builder = checked_cast(raw_builder); - auto child_builder = builder->value_builder(); - const offset_type* offsets = array->GetValues(1); - const uint8_t* validity = - array->MayHaveNulls() ? array->buffers[0]->data() : nullptr; - for (int64_t row = offset; row < offset + length; row++) { - if (!validity || BitUtil::GetBit(validity, array->offset + row)) { - RETURN_NOT_OK(builder->Append()); - int64_t length = offsets[row + 1] - offsets[row]; - RETURN_NOT_OK( - sub_appender(child_builder, array->child_data[0], offsets[row], length)); - } else { - RETURN_NOT_OK(builder->AppendNull()); - } - } - return Status::OK(); - }; - return Status::OK(); - } - - Status Visit(const MapType& ty) { - const auto& map_ty = checked_cast(ty); - ArrayAppenderFunc key_appender, item_appender; - RETURN_NOT_OK(GetValueAppenders(*map_ty.key_type(), &key_appender)); - RETURN_NOT_OK(GetValueAppenders(*map_ty.item_type(), &item_appender)); - array_appender = [=](ArrayBuilder* raw_builder, - const std::shared_ptr& array, const int64_t offset, - const int64_t length) { - auto builder = checked_cast(raw_builder); - auto key_builder = builder->key_builder(); - auto item_builder = builder->item_builder(); - const int32_t* offsets = array->GetValues(1); - const uint8_t* validity = - array->MayHaveNulls() ? array->buffers[0]->data() : nullptr; - for (int64_t row = offset; row < offset + length; row++) { - if (!validity || BitUtil::GetBit(validity, array->offset + row)) { - RETURN_NOT_OK(builder->Append()); - int64_t length = offsets[row + 1] - offsets[row]; - RETURN_NOT_OK(key_appender(key_builder, array->child_data[0]->child_data[0], - offsets[row], length)); - RETURN_NOT_OK(item_appender(item_builder, array->child_data[0]->child_data[1], - offsets[row], length)); - } else { - RETURN_NOT_OK(builder->AppendNull()); - } - } - return Status::OK(); - }; - return Status::OK(); - } - - Status Visit(const StructType& ty) { - const auto& struct_ty = checked_cast(ty); - std::vector appenders(struct_ty.num_fields()); - for (int i = 0; static_cast(i) < appenders.size(); i++) { - RETURN_NOT_OK(GetValueAppenders(*struct_ty.field(i)->type(), &appenders[i])); - } - array_appender = [=](ArrayBuilder* raw_builder, - const std::shared_ptr& array, const int64_t offset, - const int64_t length) { - auto builder = checked_cast(raw_builder); - for (int i = 0; static_cast(i) < appenders.size(); i++) { - RETURN_NOT_OK(appenders[i](builder->field_builder(i), array->child_data[i], - array->offset + offset, length)); - } - const uint8_t* validity = - array->MayHaveNulls() ? array->buffers[0]->data() : nullptr; - for (int64_t row = offset; row < offset + length; row++) { - RETURN_NOT_OK( - builder->Append(!validity || BitUtil::GetBit(validity, array->offset + row))); - } - return Status::OK(); - }; - return Status::OK(); - } - - Status Visit(const FixedSizeListType& ty) { - const auto& list_ty = checked_cast(ty); - const int64_t width = list_ty.list_size(); - ArrayAppenderFunc sub_appender; - RETURN_NOT_OK(GetValueAppenders(*list_ty.value_type(), &sub_appender)); - array_appender = [=](ArrayBuilder* raw_builder, - const std::shared_ptr& array, const int64_t offset, - const int64_t length) { - auto builder = checked_cast(raw_builder); - auto child_builder = builder->value_builder(); - const uint8_t* validity = - array->MayHaveNulls() ? array->buffers[0]->data() : nullptr; - for (int64_t row = offset; row < offset + length; row++) { - if (!validity || BitUtil::GetBit(validity, array->offset + row)) { - RETURN_NOT_OK(sub_appender(child_builder, array->child_data[0], - width * (array->offset + row), width)); - RETURN_NOT_OK(builder->Append()); - } else { - RETURN_NOT_OK(builder->AppendNull()); - } - } - return Status::OK(); - }; - return Status::OK(); - } - - Status Visit(const DenseUnionType& ty) { - const auto& union_ty = checked_cast(ty); - std::vector appenders(union_ty.num_fields()); - for (int i = 0; static_cast(i) < appenders.size(); i++) { - RETURN_NOT_OK(GetValueAppenders(*union_ty.field(i)->type(), &appenders[i])); - } - array_appender = [=](ArrayBuilder* raw_builder, - const std::shared_ptr& array, const int64_t offset, - const int64_t length) { - auto builder = checked_cast(raw_builder); - const int8_t* type_codes = array->GetValues(1); - for (int64_t row = offset; row < offset + length; row++) { - const int8_t type_code = type_codes[row]; - const int child_id = - checked_cast(*builder->type()).child_ids()[type_code]; - const int32_t union_offset = array->GetValues(2)[row]; - RETURN_NOT_OK(builder->Append(type_code)); - RETURN_NOT_OK(appenders[child_id](builder->child_builder(child_id).get(), - array->child_data[child_id], union_offset, - /*length=*/1)); - } - return Status::OK(); - }; - return Status::OK(); - } - - Status Visit(const SparseUnionType& ty) { - const auto& union_ty = checked_cast(ty); - std::vector appenders(union_ty.num_fields()); - for (int i = 0; static_cast(i) < appenders.size(); i++) { - RETURN_NOT_OK(GetValueAppenders(*union_ty.field(i)->type(), &appenders[i])); - } - array_appender = [=](ArrayBuilder* raw_builder, - const std::shared_ptr& array, const int64_t offset, - const int64_t length) { - auto builder = checked_cast(raw_builder); - for (int i = 0; static_cast(i) < appenders.size(); i++) { - RETURN_NOT_OK(appenders[i](builder->child_builder(i).get(), array->child_data[i], - array->offset + offset, length)); - } - const int8_t* type_codes = array->GetValues(1); - for (int64_t row = offset; row < offset + length; row++) { - RETURN_NOT_OK(builder->Append(type_codes[row])); - } - return Status::OK(); - }; - return Status::OK(); - } - - Status Visit(const DataType& ty) { - return Status::NotImplemented("Appender for type ", ty); - } - - ArrayAppenderFunc GetArrayAppender() { return array_appender; } - - ArrayAppenderFunc array_appender; -}; - -static Status GetValueAppenders(const DataType& type, ArrayAppenderFunc* array_appender) { - // We could extend this to cover scalars too, avoiding a type check in AppendScalar - GetAppenders get_appenders; - RETURN_NOT_OK(VisitTypeInline(type, &get_appenders)); - *array_appender = std::move(get_appenders.array_appender); - return Status::OK(); -} - template struct CaseWhenFunctor> { using offset_type = typename Type::offset_type; @@ -1831,9 +1573,6 @@ struct CaseWhenFunctor> { } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const auto& ty = checked_cast(*out->type()); - ArrayAppenderFunc array_appender; - RETURN_NOT_OK(GetValueAppenders(ty, &array_appender)); return ExecVarWidthArrayCaseWhen( ctx, batch, out, // ReserveData @@ -1859,11 +1598,6 @@ struct CaseWhenFunctor> { // AppendScalar [](ArrayBuilder* raw_builder, const Scalar& scalar) { return raw_builder->AppendScalar(scalar); - }, - // AppendArray - [&](ArrayBuilder* raw_builder, const std::shared_ptr& array, - const int64_t row) { - return array_appender(raw_builder, array, row, /*length=*/1); }); } }; @@ -1881,8 +1615,6 @@ struct CaseWhenFunctor { } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - ArrayAppenderFunc array_appender; - RETURN_NOT_OK(GetValueAppenders(*out->type(), &array_appender)); return ExecVarWidthArrayCaseWhen( ctx, batch, out, // ReserveData @@ -1890,11 +1622,6 @@ struct CaseWhenFunctor { // AppendScalar [](ArrayBuilder* raw_builder, const Scalar& scalar) { return raw_builder->AppendScalar(scalar); - }, - // AppendArray - [&](ArrayBuilder* raw_builder, const std::shared_ptr& array, - const int64_t row) { - return array_appender(raw_builder, array, row, /*length=*/1); }); } }; @@ -1912,8 +1639,6 @@ struct CaseWhenFunctor { } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - ArrayAppenderFunc array_appender; - RETURN_NOT_OK(GetValueAppenders(*out->type(), &array_appender)); return ExecVarWidthArrayCaseWhen( ctx, batch, out, // ReserveData @@ -1921,11 +1646,6 @@ struct CaseWhenFunctor { // AppendScalar [](ArrayBuilder* raw_builder, const Scalar& scalar) { return raw_builder->AppendScalar(scalar); - }, - // AppendArray - [&](ArrayBuilder* raw_builder, const std::shared_ptr& array, - const int64_t row) { - return array_appender(raw_builder, array, row, /*length=*/1); }); } }; @@ -1945,8 +1665,6 @@ struct CaseWhenFunctor { static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& ty = checked_cast(*out->type()); const int64_t width = ty.list_size(); - ArrayAppenderFunc array_appender; - RETURN_NOT_OK(GetValueAppenders(ty, &array_appender)); return ExecVarWidthArrayCaseWhen( ctx, batch, out, // ReserveData @@ -1959,11 +1677,6 @@ struct CaseWhenFunctor { // AppendScalar [](ArrayBuilder* raw_builder, const Scalar& scalar) { return raw_builder->AppendScalar(scalar); - }, - // AppendArray - [&](ArrayBuilder* raw_builder, const std::shared_ptr& array, - const int64_t row) { - return array_appender(raw_builder, array, row, /*length=*/1); }); } }; @@ -1981,8 +1694,6 @@ struct CaseWhenFunctor> { } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - ArrayAppenderFunc array_appender; - RETURN_NOT_OK(GetValueAppenders(*out->type(), &array_appender)); return ExecVarWidthArrayCaseWhen( ctx, batch, out, // ReserveData @@ -1990,11 +1701,6 @@ struct CaseWhenFunctor> { // AppendScalar [](ArrayBuilder* raw_builder, const Scalar& scalar) { return raw_builder->AppendScalar(scalar); - }, - // AppendArray - [&](ArrayBuilder* raw_builder, const std::shared_ptr& array, - const int64_t row) { - return array_appender(raw_builder, array, row, /*length=*/1); }); } }; diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index b3b0f26cead..8ae6d79ba60 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -886,47 +886,48 @@ TYPED_TEST(TestCaseWhenBinary, Basics) { auto values1 = ArrayFromJSON(type, R"(["cDE", null, "degfhi", "efg"])"); auto values2 = ArrayFromJSON(type, R"(["fghijk", "ghi", null, "hi"])"); - CheckScalar("case_when", {MakeStruct({}), values1}, values1); - CheckScalar("case_when", {MakeStruct({}), values_null}, values_null); - - CheckScalar("case_when", {MakeStruct({cond_true}), scalar1, values1}, - *MakeArrayFromScalar(*scalar1, 4)); - CheckScalar("case_when", {MakeStruct({cond_false}), scalar1, values1}, values1); - - CheckScalar("case_when", {MakeStruct({cond_true}), values1}, values1); - CheckScalar("case_when", {MakeStruct({cond_false}), values1}, values_null); - CheckScalar("case_when", {MakeStruct({cond_null}), values1}, values_null); - CheckScalar("case_when", {MakeStruct({cond_true}), values1, values2}, values1); - CheckScalar("case_when", {MakeStruct({cond_false}), values1, values2}, values2); - CheckScalar("case_when", {MakeStruct({cond_null}), values1, values2}, values2); - - CheckScalar("case_when", {MakeStruct({cond_true, cond_true}), values1, values2}, - values1); - CheckScalar("case_when", {MakeStruct({cond_false, cond_false}), values1, values2}, - values_null); - CheckScalar("case_when", {MakeStruct({cond_true, cond_false}), values1, values2}, - values1); - CheckScalar("case_when", {MakeStruct({cond_false, cond_true}), values1, values2}, - values2); - CheckScalar("case_when", {MakeStruct({cond_null, cond_true}), values1, values2}, - values2); - CheckScalar("case_when", - {MakeStruct({cond_false, cond_false}), values1, values2, values2}, values2); - - CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2}, - ArrayFromJSON(type, R"(["aBxYz", "aBxYz", "b", null])")); - CheckScalar("case_when", {MakeStruct({cond1}), scalar_null}, values_null); - CheckScalar("case_when", {MakeStruct({cond1}), scalar_null, scalar1}, - ArrayFromJSON(type, R"([null, null, "aBxYz", "aBxYz"])")); - CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2, scalar1}, - ArrayFromJSON(type, R"(["aBxYz", "aBxYz", "b", "aBxYz"])")); - - CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, - ArrayFromJSON(type, R"(["cDE", null, null, null])")); + // CheckScalar("case_when", {MakeStruct({}), values1}, values1); + // CheckScalar("case_when", {MakeStruct({}), values_null}, values_null); + + // CheckScalar("case_when", {MakeStruct({cond_true}), scalar1, values1}, + // *MakeArrayFromScalar(*scalar1, 4)); + // CheckScalar("case_when", {MakeStruct({cond_false}), scalar1, values1}, values1); + + // CheckScalar("case_when", {MakeStruct({cond_true}), values1}, values1); + // CheckScalar("case_when", {MakeStruct({cond_false}), values1}, values_null); + // CheckScalar("case_when", {MakeStruct({cond_null}), values1}, values_null); + // CheckScalar("case_when", {MakeStruct({cond_true}), values1, values2}, values1); + // CheckScalar("case_when", {MakeStruct({cond_false}), values1, values2}, values2); + // CheckScalar("case_when", {MakeStruct({cond_null}), values1, values2}, values2); + + // CheckScalar("case_when", {MakeStruct({cond_true, cond_true}), values1, values2}, + // values1); + // CheckScalar("case_when", {MakeStruct({cond_false, cond_false}), values1, values2}, + // values_null); + // CheckScalar("case_when", {MakeStruct({cond_true, cond_false}), values1, values2}, + // values1); + // CheckScalar("case_when", {MakeStruct({cond_false, cond_true}), values1, values2}, + // values2); + // CheckScalar("case_when", {MakeStruct({cond_null, cond_true}), values1, values2}, + // values2); + // CheckScalar("case_when", + // {MakeStruct({cond_false, cond_false}), values1, values2, values2}, + // values2); + + // CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2}, + // ArrayFromJSON(type, R"(["aBxYz", "aBxYz", "b", null])")); + // CheckScalar("case_when", {MakeStruct({cond1}), scalar_null}, values_null); + // CheckScalar("case_when", {MakeStruct({cond1}), scalar_null, scalar1}, + // ArrayFromJSON(type, R"([null, null, "aBxYz", "aBxYz"])")); + // CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2, scalar1}, + // ArrayFromJSON(type, R"(["aBxYz", "aBxYz", "b", "aBxYz"])")); + + // CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + // ArrayFromJSON(type, R"(["cDE", null, null, null])")); CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, ArrayFromJSON(type, R"(["cDE", null, null, "efg"])")); - CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, - ArrayFromJSON(type, R"([null, null, null, "efg"])")); + // CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + // ArrayFromJSON(type, R"([null, null, null, "efg"])")); } template From d9cb23e8e5d3fa923ba4a60f6c6c561e85f97d33 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 18 Aug 2021 16:45:59 -0400 Subject: [PATCH 15/31] ARROW-13222: [C++] Untemplate common case --- .../arrow/compute/kernels/scalar_if_else.cc | 59 ++++++++----------- 1 file changed, 24 insertions(+), 35 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index 603ee87d1c5..c700c76fb23 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1454,11 +1454,11 @@ static Status ExecVarWidthScalarCaseWhen(KernelContext* ctx, const ExecBatch& ba return Status::OK(); } -// TODO: untemplate on ReserveData to avoid generating so many overloads -template -static Status ExecVarWidthArrayCaseWhen(KernelContext* ctx, const ExecBatch& batch, - Datum* out, ReserveData reserve_data, - AppendScalar append_scalar) { +// Use std::function for reserve_data to avoid instantiating template so much +template +static Status ExecVarWidthArrayCaseWhenImpl( + KernelContext* ctx, const ExecBatch& batch, Datum* out, + std::function reserve_data, AppendScalar append_scalar) { const auto& conds_array = *batch.values[0].array(); ArrayData* output = out->mutable_array(); const bool have_else_arg = @@ -1510,6 +1510,17 @@ static Status ExecVarWidthArrayCaseWhen(KernelContext* ctx, const ExecBatch& bat return Status::OK(); } +// Single instantiation using ArrayBuilder::AppendScalar for append_scalar +static Status ExecVarWidthArrayCaseWhen( + KernelContext* ctx, const ExecBatch& batch, Datum* out, + std::function reserve_data) { + return ExecVarWidthArrayCaseWhenImpl( + ctx, batch, out, std::move(reserve_data), + [](ArrayBuilder* raw_builder, const Scalar& scalar) { + return raw_builder->AppendScalar(scalar); + }); +} + template struct CaseWhenFunctor> { using offset_type = typename Type::offset_type; @@ -1525,7 +1536,7 @@ struct CaseWhenFunctor> { } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - return ExecVarWidthArrayCaseWhen( + return ExecVarWidthArrayCaseWhenImpl( ctx, batch, out, // ReserveData [&](ArrayBuilder* raw_builder) { @@ -1594,10 +1605,6 @@ struct CaseWhenFunctor> { } } return child_builder->Reserve(reservation); - }, - // AppendScalar - [](ArrayBuilder* raw_builder, const Scalar& scalar) { - return raw_builder->AppendScalar(scalar); }); } }; @@ -1615,14 +1622,9 @@ struct CaseWhenFunctor { } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - return ExecVarWidthArrayCaseWhen( - ctx, batch, out, - // ReserveData - [](ArrayBuilder*) { return Status::OK(); }, - // AppendScalar - [](ArrayBuilder* raw_builder, const Scalar& scalar) { - return raw_builder->AppendScalar(scalar); - }); + return ExecVarWidthArrayCaseWhen(ctx, batch, out, + // ReserveData + [](ArrayBuilder*) { return Status::OK(); }); } }; @@ -1639,14 +1641,9 @@ struct CaseWhenFunctor { } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - return ExecVarWidthArrayCaseWhen( - ctx, batch, out, - // ReserveData - [](ArrayBuilder*) { return Status::OK(); }, - // AppendScalar - [](ArrayBuilder* raw_builder, const Scalar& scalar) { - return raw_builder->AppendScalar(scalar); - }); + return ExecVarWidthArrayCaseWhen(ctx, batch, out, + // ReserveData + [](ArrayBuilder*) { return Status::OK(); }); } }; @@ -1673,10 +1670,6 @@ struct CaseWhenFunctor { return checked_cast(raw_builder) ->value_builder() ->Reserve(children); - }, - // AppendScalar - [](ArrayBuilder* raw_builder, const Scalar& scalar) { - return raw_builder->AppendScalar(scalar); }); } }; @@ -1697,11 +1690,7 @@ struct CaseWhenFunctor> { return ExecVarWidthArrayCaseWhen( ctx, batch, out, // ReserveData - [&](ArrayBuilder* raw_builder) { return Status::OK(); }, - // AppendScalar - [](ArrayBuilder* raw_builder, const Scalar& scalar) { - return raw_builder->AppendScalar(scalar); - }); + [&](ArrayBuilder* raw_builder) { return Status::OK(); }); } }; From c834b5c7fa1e4fb9a892873135ad15d2d157cf2c Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 18 Aug 2021 17:12:54 -0400 Subject: [PATCH 16/31] ARROW-13222: [C++] Address feedback --- cpp/src/arrow/array/util.cc | 21 +++++++------------ cpp/src/arrow/buffer_builder.h | 11 +++++----- .../arrow/compute/kernels/scalar_if_else.cc | 5 +++-- .../kernels/scalar_if_else_benchmark.cc | 12 ----------- 4 files changed, 15 insertions(+), 34 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index b8b47788fe7..48647cdbed2 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -445,14 +445,9 @@ class NullArrayFactory { out_->buffers[1] = buffer_; // buffer_ is zeroed, but 0 may not be a valid type code if (type.type_codes()[0] != 0) { - std::memset(buffer_->mutable_data(), type.type_codes()[0], buffer_->size()); - } - - // We can't reuse the buffer unless it's zeroed - std::shared_ptr child_buffer = buffer_; - if (type.type_codes()[0] != 0) { - ARROW_ASSIGN_OR_RAISE(child_buffer, AllocateBuffer(buffer_->size(), pool_)); - std::memset(child_buffer->mutable_data(), 0, child_buffer->size()); + ARROW_ASSIGN_OR_RAISE(out_->buffers[1], AllocateBuffer(buffer_->size(), pool_)); + std::memset(out_->buffers[1]->mutable_data(), type.type_codes()[0], + buffer_->size()); } // For sparse unions, we now create children with the same length as the @@ -462,13 +457,12 @@ class NullArrayFactory { // For dense unions, we set the offsets to all zero and create children // with length 1 out_->buffers.resize(3); - out_->buffers[2] = child_buffer; + out_->buffers[2] = buffer_; child_length = 1; } for (int i = 0; i < type_->num_fields(); ++i) { - ARROW_ASSIGN_OR_RAISE(out_->child_data[i], - CreateChild(i, child_length, child_buffer)); + ARROW_ASSIGN_OR_RAISE(out_->child_data[i], CreateChild(i, child_length)); } return Status::OK(); } @@ -484,10 +478,9 @@ class NullArrayFactory { return Status::NotImplemented("construction of all-null ", type); } - Result> CreateChild( - int i, int64_t length, const std::shared_ptr& buffer = nullptr) { + Result> CreateChild(int i, int64_t length) { NullArrayFactory child_factory(pool_, type_->field(i)->type(), length); - child_factory.buffer_ = buffer ? buffer : buffer_; + child_factory.buffer_ = buffer_; return child_factory.Create(); } diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index 2c5afde1fdc..7b02ad09a82 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -28,6 +28,7 @@ #include "arrow/status.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_generate.h" +#include "arrow/util/bitmap_ops.h" #include "arrow/util/macros.h" #include "arrow/util/ubsan.h" #include "arrow/util/visibility.h" @@ -339,6 +340,7 @@ class TypedBufferBuilder { ++bit_length_; } + /// \brief Append bits from an array of bytes (one value per byte) void UnsafeAppend(const uint8_t* bytes, int64_t num_elements) { if (num_elements == 0) return; int64_t i = 0; @@ -350,14 +352,11 @@ class TypedBufferBuilder { bit_length_ += num_elements; } + /// \brief Append bits from a packed bitmap void UnsafeAppend(const uint8_t* bitmap, int64_t offset, int64_t num_elements) { if (num_elements == 0) return; - int64_t i = offset; - internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] { - bool value = BitUtil::GetBit(bitmap, i++); - false_count_ += !value; - return value; - }); + internal::CopyBitmap(bitmap, offset, num_elements, mutable_data(), bit_length_); + false_count_ += num_elements - internal::CountSetBits(bitmap, offset, num_elements); bit_length_ += num_elements; } diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index c700c76fb23..4de29319586 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1417,8 +1417,8 @@ struct CaseWhenFunctor { } }; -static Status ExecVarWidthScalarCaseWhen(KernelContext* ctx, const ExecBatch& batch, - Datum* out) { +Status ExecVarWidthScalarCaseWhen(KernelContext* ctx, const ExecBatch& batch, + Datum* out) { const auto& conds = checked_cast(*batch.values[0].scalar()); Datum result; for (size_t i = 0; i < batch.values.size() - 1; i++) { @@ -1435,6 +1435,7 @@ static Status ExecVarWidthScalarCaseWhen(KernelContext* ctx, const ExecBatch& ba } } if (out->is_scalar()) { + DCHECK(result.is_scalar() || result.kind() == Datum::NONE); *out = result.is_scalar() ? result.scalar() : MakeNullScalar(out->type()); return Status::OK(); } diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc index dbe6f957247..a8041f9086e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc @@ -159,12 +159,6 @@ static void CaseWhenBench(benchmark::State& state) { random::RandomArrayGenerator rand(/*seed=*/0); - auto cond1 = std::static_pointer_cast( - rand.ArrayOf(boolean(), len, /*null_probability=*/0.01)); - auto cond2 = std::static_pointer_cast( - rand.ArrayOf(boolean(), len, /*null_probability=*/0.01)); - auto cond3 = std::static_pointer_cast( - rand.ArrayOf(boolean(), len, /*null_probability=*/0.01)); auto cond_field = field("cond", boolean(), key_value_metadata({{"null_probability", "0.01"}})); auto cond = rand.ArrayOf(*field("", struct_({cond_field, cond_field, cond_field}), @@ -198,12 +192,6 @@ static void CaseWhenBenchList(benchmark::State& state) { random::RandomArrayGenerator rand(/*seed=*/0); - auto cond1 = std::static_pointer_cast( - rand.ArrayOf(boolean(), len, /*null_probability=*/0.01)); - auto cond2 = std::static_pointer_cast( - rand.ArrayOf(boolean(), len, /*null_probability=*/0.01)); - auto cond3 = std::static_pointer_cast( - rand.ArrayOf(boolean(), len, /*null_probability=*/0.01)); auto cond_field = field("cond", boolean(), key_value_metadata({{"null_probability", "0.01"}})); auto cond = rand.ArrayOf(*field("", struct_({cond_field, cond_field, cond_field}), From 2b6dd26062ba7650276b19e41efe58e23a1613f5 Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 19 Aug 2021 11:08:18 -0400 Subject: [PATCH 17/31] ARROW-13222: [C++] Add tests for AppendArraySliceUnchecked --- cpp/src/arrow/array/array_test.cc | 99 +++++++++++++++++-- cpp/src/arrow/array/builder_base.h | 15 +-- cpp/src/arrow/array/builder_binary.h | 4 +- cpp/src/arrow/array/builder_nested.h | 33 ++++--- cpp/src/arrow/array/builder_union.h | 4 +- .../compute/kernels/scalar_if_else_test.cc | 79 ++++++++------- cpp/src/arrow/type_traits.h | 11 +++ 7 files changed, 166 insertions(+), 79 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index d417f29f58f..e714f03b191 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -500,23 +500,30 @@ static ScalarVector GetScalars() { const auto dense_union_ty = ::arrow::dense_union(union_fields, union_type_codes); return { - std::make_shared(false), std::make_shared(3), - std::make_shared(3), std::make_shared(3), - std::make_shared(3), std::make_shared(3.0), - std::make_shared(10), std::make_shared(11), + std::make_shared(false), + std::make_shared(3), + std::make_shared(3), + std::make_shared(3), + std::make_shared(3), + std::make_shared(3.0), + std::make_shared(10), + std::make_shared(11), std::make_shared(1000, time32(TimeUnit::SECOND)), std::make_shared(1111, time64(TimeUnit::MICRO)), std::make_shared(1111, timestamp(TimeUnit::MILLI)), std::make_shared(1), std::make_shared(daytime), std::make_shared(60, duration(TimeUnit::SECOND)), - std::make_shared(hello), std::make_shared(hello), + std::make_shared(hello), + std::make_shared(hello), std::make_shared( hello, fixed_size_binary(static_cast(hello->size()))), std::make_shared(Decimal128(10), decimal(16, 4)), std::make_shared(Decimal256(10), decimal(76, 38)), - std::make_shared(hello), std::make_shared(hello), + std::make_shared(hello), + std::make_shared(hello), std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3]")), + ScalarFromJSON(map(int8(), utf8()), R"([[1, "foo"], [2, "bar"]])"), std::make_shared(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")), std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3, 4]")), std::make_shared( @@ -533,7 +540,12 @@ static ScalarVector GetScalars() { std::make_shared(std::make_shared(101), 6, dense_union_ty), std::make_shared(std::make_shared(101), 42, - dense_union_ty)}; + dense_union_ty), + DictionaryScalar::Make(ScalarFromJSON(int8(), "1"), + ArrayFromJSON(utf8(), R"(["foo", "bar"])")), + DictionaryScalar::Make(ScalarFromJSON(uint8(), "1"), + ArrayFromJSON(utf8(), R"(["foo", "bar"])")), + }; } TEST_F(TestArray, TestMakeArrayFromScalar) { @@ -560,6 +572,8 @@ TEST_F(TestArray, TestMakeArrayFromScalar) { } for (auto scalar : scalars) { + // TODO(ARROW-13197): appending dictionary scalars not implemented + if (is_dictionary(scalar->type->id())) continue; AssertAppendScalar(pool_, scalar); } } @@ -614,6 +628,77 @@ TEST_F(TestArray, TestMakeArrayFromMapScalar) { AssertAppendScalar(pool_, std::make_shared(scalar)); } +TEST_F(TestArray, TestAppendArraySlice) { + auto scalars = GetScalars(); + for (const auto& scalar : scalars) { + // TODO(ARROW-13573): appending dictionary arrays not implemented + if (is_dictionary(scalar->type->id())) continue; + + ARROW_SCOPED_TRACE(*scalar->type); + ASSERT_OK_AND_ASSIGN(auto array, MakeArrayFromScalar(*scalar, 16)); + ASSERT_OK_AND_ASSIGN(auto nulls, MakeArrayOfNull(scalar->type, 16)); + + std::unique_ptr builder; + ASSERT_OK(MakeBuilder(pool_, scalar->type, &builder)); + + ASSERT_OK(builder->AppendArraySliceUnchecked(*array->data(), 0, 4)); + ASSERT_EQ(4, builder->length()); + ASSERT_OK(builder->AppendArraySliceUnchecked(*array->data(), 0, 0)); + ASSERT_EQ(4, builder->length()); + ASSERT_OK(builder->AppendArraySliceUnchecked(*array->data(), 1, 0)); + ASSERT_EQ(4, builder->length()); + ASSERT_OK(builder->AppendArraySliceUnchecked(*array->data(), 1, 4)); + ASSERT_EQ(8, builder->length()); + + ASSERT_OK(builder->AppendArraySliceUnchecked(*nulls->data(), 0, 4)); + ASSERT_EQ(12, builder->length()); + if (!is_union(scalar->type->id())) { + ASSERT_EQ(4, builder->null_count()); + } + ASSERT_OK(builder->AppendArraySliceUnchecked(*nulls->data(), 0, 0)); + ASSERT_EQ(12, builder->length()); + if (!is_union(scalar->type->id())) { + ASSERT_EQ(4, builder->null_count()); + } + ASSERT_OK(builder->AppendArraySliceUnchecked(*nulls->data(), 1, 0)); + ASSERT_EQ(12, builder->length()); + if (!is_union(scalar->type->id())) { + ASSERT_EQ(4, builder->null_count()); + } + ASSERT_OK(builder->AppendArraySliceUnchecked(*nulls->data(), 1, 4)); + ASSERT_EQ(16, builder->length()); + if (!is_union(scalar->type->id())) { + ASSERT_EQ(8, builder->null_count()); + } + + std::shared_ptr result; + ASSERT_OK(builder->Finish(&result)); + ASSERT_OK(result->ValidateFull()); + ASSERT_EQ(16, result->length()); + if (!is_union(scalar->type->id())) { + ASSERT_EQ(8, result->null_count()); + } + } + + { + ASSERT_OK_AND_ASSIGN(auto array, MakeArrayOfNull(null(), 16)); + NullBuilder builder(pool_); + ASSERT_OK(builder.AppendArraySliceUnchecked(*array->data(), 0, 4)); + ASSERT_EQ(4, builder.length()); + ASSERT_OK(builder.AppendArraySliceUnchecked(*array->data(), 0, 0)); + ASSERT_EQ(4, builder.length()); + ASSERT_OK(builder.AppendArraySliceUnchecked(*array->data(), 1, 0)); + ASSERT_EQ(4, builder.length()); + ASSERT_OK(builder.AppendArraySliceUnchecked(*array->data(), 1, 4)); + ASSERT_EQ(8, builder.length()); + std::shared_ptr result; + ASSERT_OK(builder.Finish(&result)); + ASSERT_OK(result->ValidateFull()); + ASSERT_EQ(8, result->length()); + ASSERT_EQ(8, result->null_count()); + } +} + TEST_F(TestArray, ValidateBuffersPrimitive) { auto empty_buffer = std::make_shared(""); auto null_buffer = Buffer::FromString("\xff"); diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index b05a104bdc4..3105b95fbaa 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -123,22 +123,13 @@ class ARROW_EXPORT ArrayBuilder { Status AppendScalar(const Scalar& scalar, int64_t n_repeats); Status AppendScalars(const ScalarVector& scalars); - /// \brief Append a range of values from an array - Status AppendArraySlice(const ArrayData& array, const int64_t offset, - const int64_t length) { - if (!type()->Equals(*array.type)) { - // TODO: test this - return Status::TypeError("Expected array of type ", *type(), - " but got array of type ", *array.type); - } - return AppendArraySliceUnchecked(array, offset, length); - } - /// \brief Append a range of values from an array without checking type compatibility + /// \brief Append a range of values from an array. + /// + /// The given array must be the same type as the builder. virtual Status AppendArraySliceUnchecked(const ArrayData& array, const int64_t offset, const int64_t length) { return Status::NotImplemented("AppendArraySliceUnchecked for builder for ", *type()); } - // TODO: overloads for arrays /// For cases where raw data was memcpy'd into the internal buffers, allows us /// to advance the length of the builder. It is your responsibility to use diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index ec12aa8e6b2..2cc40a94a38 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -283,9 +283,9 @@ class BaseBinaryBuilder : public ArrayBuilder { if (!bitmap || BitUtil::GetBit(bitmap, array.offset + offset + i)) { const offset_type start = offsets[offset + i]; const offset_type end = offsets[offset + i + 1]; - RETURN_NOT_OK(Append(data + start, end - start)); + ARROW_RETURN_NOT_OK(Append(data + start, end - start)); } else { - RETURN_NOT_OK(AppendNull()); + ARROW_RETURN_NOT_OK(AppendNull()); } } return Status::OK(); diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 57c460be272..6abedc0de8b 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -128,12 +128,12 @@ class BaseListBuilder : public ArrayBuilder { const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : nullptr; for (int64_t row = offset; row < offset + length; row++) { if (!validity || BitUtil::GetBit(validity, array.offset + row)) { - RETURN_NOT_OK(Append()); + ARROW_RETURN_NOT_OK(Append()); int64_t slot_length = offsets[row + 1] - offsets[row]; - value_builder_->AppendArraySliceUnchecked(*array.child_data[0], offsets[row], - slot_length); + ARROW_RETURN_NOT_OK(value_builder_->AppendArraySliceUnchecked( + *array.child_data[0], offsets[row], slot_length)); } else { - RETURN_NOT_OK(AppendNull()); + ARROW_RETURN_NOT_OK(AppendNull()); } } return Status::OK(); @@ -298,14 +298,14 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : nullptr; for (int64_t row = offset; row < offset + length; row++) { if (!validity || BitUtil::GetBit(validity, array.offset + row)) { - RETURN_NOT_OK(Append()); + ARROW_RETURN_NOT_OK(Append()); const int64_t slot_length = offsets[row + 1] - offsets[row]; - key_builder_->AppendArraySliceUnchecked(*array.child_data[0]->child_data[0], - offsets[row], slot_length); - item_builder_->AppendArraySliceUnchecked(*array.child_data[0]->child_data[1], - offsets[row], slot_length); + ARROW_RETURN_NOT_OK(key_builder_->AppendArraySliceUnchecked( + *array.child_data[0]->child_data[0], offsets[row], slot_length)); + ARROW_RETURN_NOT_OK(item_builder_->AppendArraySliceUnchecked( + *array.child_data[0]->child_data[1], offsets[row], slot_length)); } else { - RETURN_NOT_OK(AppendNull()); + ARROW_RETURN_NOT_OK(AppendNull()); } } return Status::OK(); @@ -415,11 +415,11 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : nullptr; for (int64_t row = offset; row < offset + length; row++) { if (!validity || BitUtil::GetBit(validity, array.offset + row)) { - RETURN_NOT_OK(value_builder_->AppendArraySliceUnchecked( + ARROW_RETURN_NOT_OK(value_builder_->AppendArraySliceUnchecked( *array.child_data[0], list_size_ * (array.offset + row), list_size_)); - RETURN_NOT_OK(Append()); + ARROW_RETURN_NOT_OK(Append()); } else { - RETURN_NOT_OK(AppendNull()); + ARROW_RETURN_NOT_OK(AppendNull()); } } return Status::OK(); @@ -521,12 +521,13 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, int64_t length) override { for (int i = 0; static_cast(i) < children_.size(); i++) { - children_[i]->AppendArraySliceUnchecked(*array.child_data[i], array.offset + offset, - length); + ARROW_RETURN_NOT_OK(children_[i]->AppendArraySliceUnchecked( + *array.child_data[i], array.offset + offset, length)); } const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : nullptr; for (int64_t row = offset; row < offset + length; row++) { - RETURN_NOT_OK(Append(!validity || BitUtil::GetBit(validity, array.offset + row))); + ARROW_RETURN_NOT_OK( + Append(!validity || BitUtil::GetBit(validity, array.offset + row))); } return Status::OK(); } diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index 24ddf76f12c..d4fe41a3097 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -164,8 +164,8 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { const int8_t type_code = type_codes[row]; const int child_id = type_id_to_child_id_[type_code]; const int32_t union_offset = offsets[row]; - RETURN_NOT_OK(Append(type_code)); - RETURN_NOT_OK(type_id_to_children_[type_code]->AppendArraySliceUnchecked( + ARROW_RETURN_NOT_OK(Append(type_code)); + ARROW_RETURN_NOT_OK(type_id_to_children_[type_code]->AppendArraySliceUnchecked( *array.child_data[child_id], union_offset, /*length=*/1)); } return Status::OK(); diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index 8ae6d79ba60..b3b0f26cead 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -886,48 +886,47 @@ TYPED_TEST(TestCaseWhenBinary, Basics) { auto values1 = ArrayFromJSON(type, R"(["cDE", null, "degfhi", "efg"])"); auto values2 = ArrayFromJSON(type, R"(["fghijk", "ghi", null, "hi"])"); - // CheckScalar("case_when", {MakeStruct({}), values1}, values1); - // CheckScalar("case_when", {MakeStruct({}), values_null}, values_null); - - // CheckScalar("case_when", {MakeStruct({cond_true}), scalar1, values1}, - // *MakeArrayFromScalar(*scalar1, 4)); - // CheckScalar("case_when", {MakeStruct({cond_false}), scalar1, values1}, values1); - - // CheckScalar("case_when", {MakeStruct({cond_true}), values1}, values1); - // CheckScalar("case_when", {MakeStruct({cond_false}), values1}, values_null); - // CheckScalar("case_when", {MakeStruct({cond_null}), values1}, values_null); - // CheckScalar("case_when", {MakeStruct({cond_true}), values1, values2}, values1); - // CheckScalar("case_when", {MakeStruct({cond_false}), values1, values2}, values2); - // CheckScalar("case_when", {MakeStruct({cond_null}), values1, values2}, values2); - - // CheckScalar("case_when", {MakeStruct({cond_true, cond_true}), values1, values2}, - // values1); - // CheckScalar("case_when", {MakeStruct({cond_false, cond_false}), values1, values2}, - // values_null); - // CheckScalar("case_when", {MakeStruct({cond_true, cond_false}), values1, values2}, - // values1); - // CheckScalar("case_when", {MakeStruct({cond_false, cond_true}), values1, values2}, - // values2); - // CheckScalar("case_when", {MakeStruct({cond_null, cond_true}), values1, values2}, - // values2); - // CheckScalar("case_when", - // {MakeStruct({cond_false, cond_false}), values1, values2, values2}, - // values2); - - // CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2}, - // ArrayFromJSON(type, R"(["aBxYz", "aBxYz", "b", null])")); - // CheckScalar("case_when", {MakeStruct({cond1}), scalar_null}, values_null); - // CheckScalar("case_when", {MakeStruct({cond1}), scalar_null, scalar1}, - // ArrayFromJSON(type, R"([null, null, "aBxYz", "aBxYz"])")); - // CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2, scalar1}, - // ArrayFromJSON(type, R"(["aBxYz", "aBxYz", "b", "aBxYz"])")); - - // CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, - // ArrayFromJSON(type, R"(["cDE", null, null, null])")); + CheckScalar("case_when", {MakeStruct({}), values1}, values1); + CheckScalar("case_when", {MakeStruct({}), values_null}, values_null); + + CheckScalar("case_when", {MakeStruct({cond_true}), scalar1, values1}, + *MakeArrayFromScalar(*scalar1, 4)); + CheckScalar("case_when", {MakeStruct({cond_false}), scalar1, values1}, values1); + + CheckScalar("case_when", {MakeStruct({cond_true}), values1}, values1); + CheckScalar("case_when", {MakeStruct({cond_false}), values1}, values_null); + CheckScalar("case_when", {MakeStruct({cond_null}), values1}, values_null); + CheckScalar("case_when", {MakeStruct({cond_true}), values1, values2}, values1); + CheckScalar("case_when", {MakeStruct({cond_false}), values1, values2}, values2); + CheckScalar("case_when", {MakeStruct({cond_null}), values1, values2}, values2); + + CheckScalar("case_when", {MakeStruct({cond_true, cond_true}), values1, values2}, + values1); + CheckScalar("case_when", {MakeStruct({cond_false, cond_false}), values1, values2}, + values_null); + CheckScalar("case_when", {MakeStruct({cond_true, cond_false}), values1, values2}, + values1); + CheckScalar("case_when", {MakeStruct({cond_false, cond_true}), values1, values2}, + values2); + CheckScalar("case_when", {MakeStruct({cond_null, cond_true}), values1, values2}, + values2); + CheckScalar("case_when", + {MakeStruct({cond_false, cond_false}), values1, values2, values2}, values2); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2}, + ArrayFromJSON(type, R"(["aBxYz", "aBxYz", "b", null])")); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null}, values_null); + CheckScalar("case_when", {MakeStruct({cond1}), scalar_null, scalar1}, + ArrayFromJSON(type, R"([null, null, "aBxYz", "aBxYz"])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), scalar1, scalar2, scalar1}, + ArrayFromJSON(type, R"(["aBxYz", "aBxYz", "b", "aBxYz"])")); + + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2}, + ArrayFromJSON(type, R"(["cDE", null, null, null])")); CheckScalar("case_when", {MakeStruct({cond1, cond2}), values1, values2, values1}, ArrayFromJSON(type, R"(["cDE", null, null, "efg"])")); - // CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, - // ArrayFromJSON(type, R"([null, null, null, "efg"])")); + CheckScalar("case_when", {MakeStruct({cond1, cond2}), values_null, values2, values1}, + ArrayFromJSON(type, R"([null, null, null, "efg"])")); } template diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 1e04821c019..f94ba513e6a 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -1005,6 +1005,17 @@ static inline bool is_nested(Type::type type_id) { return false; } +static inline bool is_union(Type::type type_id) { + switch (type_id) { + case Type::SPARSE_UNION: + case Type::DENSE_UNION: + return true; + default: + break; + } + return false; +} + static inline int offset_bit_width(Type::type type_id) { switch (type_id) { case Type::STRING: From 7390452d26d4dcfd3f1aa64fa8c20a548ab9465e Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 19 Aug 2021 11:11:47 -0400 Subject: [PATCH 18/31] ARROW-13222: [C++] Fix lint/MSVC warnings --- cpp/src/arrow/array/builder_base.h | 4 ++-- cpp/src/arrow/array/builder_nested.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 3105b95fbaa..99fa33433c9 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -126,8 +126,8 @@ class ARROW_EXPORT ArrayBuilder { /// \brief Append a range of values from an array. /// /// The given array must be the same type as the builder. - virtual Status AppendArraySliceUnchecked(const ArrayData& array, const int64_t offset, - const int64_t length) { + virtual Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, + int64_t length) { return Status::NotImplemented("AppendArraySliceUnchecked for builder for ", *type()); } diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 6abedc0de8b..798f0196890 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -125,7 +125,7 @@ class BaseListBuilder : public ArrayBuilder { Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, int64_t length) override { const offset_type* offsets = array.GetValues(1); - const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : nullptr; + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { if (!validity || BitUtil::GetBit(validity, array.offset + row)) { ARROW_RETURN_NOT_OK(Append()); @@ -295,7 +295,7 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, int64_t length) override { const int32_t* offsets = array.GetValues(1); - const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : nullptr; + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { if (!validity || BitUtil::GetBit(validity, array.offset + row)) { ARROW_RETURN_NOT_OK(Append()); @@ -412,7 +412,7 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, int64_t length) final { - const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : nullptr; + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { if (!validity || BitUtil::GetBit(validity, array.offset + row)) { ARROW_RETURN_NOT_OK(value_builder_->AppendArraySliceUnchecked( @@ -524,7 +524,7 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { ARROW_RETURN_NOT_OK(children_[i]->AppendArraySliceUnchecked( *array.child_data[i], array.offset + offset, length)); } - const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : nullptr; + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { ARROW_RETURN_NOT_OK( Append(!validity || BitUtil::GetBit(validity, array.offset + row))); From bd95fc8cd670f5615fad1430910c88124f289556 Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 19 Aug 2021 12:51:24 -0400 Subject: [PATCH 19/31] ARROW-13222: [C++] Optimize BaseBinaryBuilder::AppendArraySliceUnchecked --- cpp/src/arrow/array/array_test.cc | 53 +++++++++++++++++----------- cpp/src/arrow/array/builder_binary.h | 42 ++++++++++++++++++---- 2 files changed, 68 insertions(+), 27 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index e714f03b191..d476cc91844 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -37,6 +37,7 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_decimal.h" #include "arrow/array/builder_dict.h" +#include "arrow/array/concatenate.h" #include "arrow/array/data.h" #include "arrow/array/util.h" #include "arrow/buffer.h" @@ -634,6 +635,8 @@ TEST_F(TestArray, TestAppendArraySlice) { // TODO(ARROW-13573): appending dictionary arrays not implemented if (is_dictionary(scalar->type->id())) continue; + auto is_union = arrow::is_union(scalar->type->id()); + ARROW_SCOPED_TRACE(*scalar->type); ASSERT_OK_AND_ASSIGN(auto array, MakeArrayFromScalar(*scalar, 16)); ASSERT_OK_AND_ASSIGN(auto nulls, MakeArrayOfNull(scalar->type, 16)); @@ -641,7 +644,9 @@ TEST_F(TestArray, TestAppendArraySlice) { std::unique_ptr builder; ASSERT_OK(MakeBuilder(pool_, scalar->type, &builder)); - ASSERT_OK(builder->AppendArraySliceUnchecked(*array->data(), 0, 4)); + ASSERT_OK(builder->AppendArraySliceUnchecked(*array->data(), 0, 2)); + ASSERT_EQ(2, builder->length()); + ASSERT_OK(builder->AppendArraySliceUnchecked(*array->data(), 2, 2)); ASSERT_EQ(4, builder->length()); ASSERT_OK(builder->AppendArraySliceUnchecked(*array->data(), 0, 0)); ASSERT_EQ(4, builder->length()); @@ -652,31 +657,39 @@ TEST_F(TestArray, TestAppendArraySlice) { ASSERT_OK(builder->AppendArraySliceUnchecked(*nulls->data(), 0, 4)); ASSERT_EQ(12, builder->length()); - if (!is_union(scalar->type->id())) { - ASSERT_EQ(4, builder->null_count()); - } + ASSERT_EQ(is_union ? 0 : 4, builder->null_count()); ASSERT_OK(builder->AppendArraySliceUnchecked(*nulls->data(), 0, 0)); ASSERT_EQ(12, builder->length()); - if (!is_union(scalar->type->id())) { - ASSERT_EQ(4, builder->null_count()); - } + ASSERT_EQ(is_union ? 0 : 4, builder->null_count()); ASSERT_OK(builder->AppendArraySliceUnchecked(*nulls->data(), 1, 0)); ASSERT_EQ(12, builder->length()); - if (!is_union(scalar->type->id())) { - ASSERT_EQ(4, builder->null_count()); - } + ASSERT_EQ(is_union ? 0 : 4, builder->null_count()); ASSERT_OK(builder->AppendArraySliceUnchecked(*nulls->data(), 1, 4)); ASSERT_EQ(16, builder->length()); - if (!is_union(scalar->type->id())) { - ASSERT_EQ(8, builder->null_count()); - } - - std::shared_ptr result; - ASSERT_OK(builder->Finish(&result)); - ASSERT_OK(result->ValidateFull()); - ASSERT_EQ(16, result->length()); - if (!is_union(scalar->type->id())) { - ASSERT_EQ(8, result->null_count()); + ASSERT_EQ(is_union ? 0 : 8, builder->null_count()); + + if (is_union) { + // Concatenate not supported by union + std::shared_ptr result; + ASSERT_OK(builder->Finish(&result)); + ASSERT_OK(result->ValidateFull()); + ASSERT_EQ(16, result->length()); + ASSERT_EQ(0, result->null_count()); + } else { + ASSERT_OK_AND_ASSIGN(auto both, Concatenate({array, nulls})); + ASSERT_OK(builder->AppendArraySliceUnchecked(*both->data(), 14, 4)); + ASSERT_EQ(20, builder->length()); + ASSERT_EQ(10, builder->null_count()); + std::shared_ptr result; + ASSERT_OK(builder->Finish(&result)); + ASSERT_OK(result->ValidateFull()); + ASSERT_EQ(20, result->length()); + ASSERT_EQ(10, result->null_count()); + + ASSERT_OK_AND_ASSIGN(auto expected, + Concatenate({array->Slice(0, 4), array->Slice(1, 4), + nulls->Slice(0, 8), both->Slice(14, 4)})); + AssertArraysApproxEqual(*expected, *result, /*verbose=*/true); } } diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 2cc40a94a38..c84e66d0697 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -35,6 +35,7 @@ #include "arrow/buffer_builder.h" #include "arrow/status.h" #include "arrow/type.h" +#include "arrow/util/bit_run_reader.h" #include "arrow/util/macros.h" #include "arrow/util/string_view.h" // IWYU pragma: export #include "arrow/util/visibility.h" @@ -276,17 +277,44 @@ class BaseBinaryBuilder : public ArrayBuilder { Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, int64_t length) override { + if (length == 0) return Status::OK(); auto bitmap = array.GetValues(0, 0); - auto offsets = array.GetValues(1); + auto offsets = array.GetValues(1) + offset; auto data = array.GetValues(2, 0); - for (int64_t i = 0; i < length; i++) { - if (!bitmap || BitUtil::GetBit(bitmap, array.offset + offset + i)) { - const offset_type start = offsets[offset + i]; - const offset_type end = offsets[offset + i + 1]; - ARROW_RETURN_NOT_OK(Append(data + start, end - start)); + ARROW_RETURN_NOT_OK(Reserve(length)); + + auto copy_values = [&](int64_t position, int64_t length) { + const offset_type first_offset = offsets[position]; + const offset_type last_offset = offsets[position + length]; + const offset_type data_length = last_offset - first_offset; + ARROW_RETURN_NOT_OK(ReserveData(data_length)); + UnsafeAppendNextOffset(); + const offset_type base_offset = + offsets_builder_.data()[offsets_builder_.length() - 1]; + ARROW_RETURN_NOT_OK(value_data_builder_.Append(data + first_offset, data_length)); + for (int64_t i = 1; i < length; i++) { + offsets_builder_.UnsafeAppend(base_offset + + (offsets[position + i] - first_offset)); + } + UnsafeAppendToBitmap(length, true); + return Status::OK(); + }; + + if (!bitmap) { + return copy_values(0, length); + } + internal::BitRunReader reader(bitmap, array.offset + offset, length); + int64_t position = 0; + while (true) { + internal::BitRun run = reader.NextRun(); + if (run.length == 0) break; + + if (run.set) { + ARROW_RETURN_NOT_OK(copy_values(position, run.length)); } else { - ARROW_RETURN_NOT_OK(AppendNull()); + ARROW_RETURN_NOT_OK(AppendNulls(run.length)); } + position += run.length; } return Status::OK(); } From bf01084d0da984141e3121d817fed70c778fe634 Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 19 Aug 2021 12:55:07 -0400 Subject: [PATCH 20/31] ARROW-13222: [C++] Optimize StructBuilder::AppendArraySliceUnchecked --- cpp/src/arrow/array/builder_nested.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 798f0196890..f4b2ea7ccd0 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -525,10 +525,8 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { *array.child_data[i], array.offset + offset, length)); } const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; - for (int64_t row = offset; row < offset + length; row++) { - ARROW_RETURN_NOT_OK( - Append(!validity || BitUtil::GetBit(validity, array.offset + row))); - } + ARROW_RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(validity, array.offset + offset, length); return Status::OK(); } From b3c12a1d68da064a498204e28fd8a52906a352fb Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 19 Aug 2021 12:58:10 -0400 Subject: [PATCH 21/31] ARROW-13222: [C++] Reorganize union builders --- cpp/src/arrow/array/builder_union.cc | 28 ++++++++++++++++++++++++++++ cpp/src/arrow/array/builder_union.h | 24 ++---------------------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index e55315f32cf..e2ff473d65d 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -45,6 +45,22 @@ Status BasicUnionBuilder::FinishInternal(std::shared_ptr* out) { return Status::OK(); } +Status DenseUnionBuilder::AppendArraySliceUnchecked(const ArrayData& array, + const int64_t offset, + const int64_t length) { + const int8_t* type_codes = array.GetValues(1); + const int32_t* offsets = array.GetValues(2); + for (int64_t row = offset; row < offset + length; row++) { + const int8_t type_code = type_codes[row]; + const int child_id = type_id_to_child_id_[type_code]; + const int32_t union_offset = offsets[row]; + ARROW_RETURN_NOT_OK(Append(type_code)); + ARROW_RETURN_NOT_OK(type_id_to_children_[type_code]->AppendArraySliceUnchecked( + *array.child_data[child_id], union_offset, /*length=*/1)); + } + return Status::OK(); +} + Status DenseUnionBuilder::FinishInternal(std::shared_ptr* out) { ARROW_RETURN_NOT_OK(BasicUnionBuilder::FinishInternal(out)); (*out)->buffers.resize(3); @@ -122,4 +138,16 @@ int8_t BasicUnionBuilder::NextTypeId() { return dense_type_id_++; } +Status SparseUnionBuilder::AppendArraySliceUnchecked(const ArrayData& array, + const int64_t offset, + const int64_t length) { + for (size_t i = 0; i < type_codes_.size(); i++) { + type_id_to_children_[type_codes_[i]]->AppendArraySliceUnchecked( + *array.child_data[i], array.offset + offset, length); + } + const int8_t* type_codes = array.GetValues(1); + types_builder_.Append(type_codes + offset, length); + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index d4fe41a3097..da44ba5922f 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -157,19 +157,7 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { } Status AppendArraySliceUnchecked(const ArrayData& array, const int64_t offset, - const int64_t length) override { - const int8_t* type_codes = array.GetValues(1); - const int32_t* offsets = array.GetValues(2); - for (int64_t row = offset; row < offset + length; row++) { - const int8_t type_code = type_codes[row]; - const int child_id = type_id_to_child_id_[type_code]; - const int32_t union_offset = offsets[row]; - ARROW_RETURN_NOT_OK(Append(type_code)); - ARROW_RETURN_NOT_OK(type_id_to_children_[type_code]->AppendArraySliceUnchecked( - *array.child_data[child_id], union_offset, /*length=*/1)); - } - return Status::OK(); - } + const int64_t length) override; Status FinishInternal(std::shared_ptr* out) override; @@ -248,15 +236,7 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { Status Append(int8_t next_type) { return types_builder_.Append(next_type); } Status AppendArraySliceUnchecked(const ArrayData& array, const int64_t offset, - const int64_t length) override { - for (size_t i = 0; i < type_codes_.size(); i++) { - type_id_to_children_[type_codes_[i]]->AppendArraySliceUnchecked( - *array.child_data[i], array.offset + offset, length); - } - const int8_t* type_codes = array.GetValues(1); - types_builder_.Append(type_codes + offset, length); - return Status::OK(); - } + const int64_t length) override; }; } // namespace arrow From 0324fd104c27de981ebf2026501a20a081ab9d7d Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 19 Aug 2021 12:58:30 -0400 Subject: [PATCH 22/31] ARROW-13222: [C++] Don't over-allocate --- cpp/src/arrow/array/util.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 48647cdbed2..5e95dc93f56 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -445,9 +445,8 @@ class NullArrayFactory { out_->buffers[1] = buffer_; // buffer_ is zeroed, but 0 may not be a valid type code if (type.type_codes()[0] != 0) { - ARROW_ASSIGN_OR_RAISE(out_->buffers[1], AllocateBuffer(buffer_->size(), pool_)); - std::memset(out_->buffers[1]->mutable_data(), type.type_codes()[0], - buffer_->size()); + ARROW_ASSIGN_OR_RAISE(out_->buffers[1], AllocateBuffer(length_, pool_)); + std::memset(out_->buffers[1]->mutable_data(), type.type_codes()[0], length_); } // For sparse unions, we now create children with the same length as the From 0ac03c14ced7290f6f8d3e66283b537233eeedf3 Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 19 Aug 2021 13:09:02 -0400 Subject: [PATCH 23/31] Revert "ARROW-13222: [C++] Optimize BaseBinaryBuilder::AppendArraySliceUnchecked" This reverts commit fe9ecbf0a35117db7ad955ef345181328633277b. --- cpp/src/arrow/array/array_test.cc | 53 +++++++++++----------------- cpp/src/arrow/array/builder_binary.h | 42 ++++------------------ 2 files changed, 27 insertions(+), 68 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index d476cc91844..e714f03b191 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -37,7 +37,6 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_decimal.h" #include "arrow/array/builder_dict.h" -#include "arrow/array/concatenate.h" #include "arrow/array/data.h" #include "arrow/array/util.h" #include "arrow/buffer.h" @@ -635,8 +634,6 @@ TEST_F(TestArray, TestAppendArraySlice) { // TODO(ARROW-13573): appending dictionary arrays not implemented if (is_dictionary(scalar->type->id())) continue; - auto is_union = arrow::is_union(scalar->type->id()); - ARROW_SCOPED_TRACE(*scalar->type); ASSERT_OK_AND_ASSIGN(auto array, MakeArrayFromScalar(*scalar, 16)); ASSERT_OK_AND_ASSIGN(auto nulls, MakeArrayOfNull(scalar->type, 16)); @@ -644,9 +641,7 @@ TEST_F(TestArray, TestAppendArraySlice) { std::unique_ptr builder; ASSERT_OK(MakeBuilder(pool_, scalar->type, &builder)); - ASSERT_OK(builder->AppendArraySliceUnchecked(*array->data(), 0, 2)); - ASSERT_EQ(2, builder->length()); - ASSERT_OK(builder->AppendArraySliceUnchecked(*array->data(), 2, 2)); + ASSERT_OK(builder->AppendArraySliceUnchecked(*array->data(), 0, 4)); ASSERT_EQ(4, builder->length()); ASSERT_OK(builder->AppendArraySliceUnchecked(*array->data(), 0, 0)); ASSERT_EQ(4, builder->length()); @@ -657,39 +652,31 @@ TEST_F(TestArray, TestAppendArraySlice) { ASSERT_OK(builder->AppendArraySliceUnchecked(*nulls->data(), 0, 4)); ASSERT_EQ(12, builder->length()); - ASSERT_EQ(is_union ? 0 : 4, builder->null_count()); + if (!is_union(scalar->type->id())) { + ASSERT_EQ(4, builder->null_count()); + } ASSERT_OK(builder->AppendArraySliceUnchecked(*nulls->data(), 0, 0)); ASSERT_EQ(12, builder->length()); - ASSERT_EQ(is_union ? 0 : 4, builder->null_count()); + if (!is_union(scalar->type->id())) { + ASSERT_EQ(4, builder->null_count()); + } ASSERT_OK(builder->AppendArraySliceUnchecked(*nulls->data(), 1, 0)); ASSERT_EQ(12, builder->length()); - ASSERT_EQ(is_union ? 0 : 4, builder->null_count()); + if (!is_union(scalar->type->id())) { + ASSERT_EQ(4, builder->null_count()); + } ASSERT_OK(builder->AppendArraySliceUnchecked(*nulls->data(), 1, 4)); ASSERT_EQ(16, builder->length()); - ASSERT_EQ(is_union ? 0 : 8, builder->null_count()); - - if (is_union) { - // Concatenate not supported by union - std::shared_ptr result; - ASSERT_OK(builder->Finish(&result)); - ASSERT_OK(result->ValidateFull()); - ASSERT_EQ(16, result->length()); - ASSERT_EQ(0, result->null_count()); - } else { - ASSERT_OK_AND_ASSIGN(auto both, Concatenate({array, nulls})); - ASSERT_OK(builder->AppendArraySliceUnchecked(*both->data(), 14, 4)); - ASSERT_EQ(20, builder->length()); - ASSERT_EQ(10, builder->null_count()); - std::shared_ptr result; - ASSERT_OK(builder->Finish(&result)); - ASSERT_OK(result->ValidateFull()); - ASSERT_EQ(20, result->length()); - ASSERT_EQ(10, result->null_count()); - - ASSERT_OK_AND_ASSIGN(auto expected, - Concatenate({array->Slice(0, 4), array->Slice(1, 4), - nulls->Slice(0, 8), both->Slice(14, 4)})); - AssertArraysApproxEqual(*expected, *result, /*verbose=*/true); + if (!is_union(scalar->type->id())) { + ASSERT_EQ(8, builder->null_count()); + } + + std::shared_ptr result; + ASSERT_OK(builder->Finish(&result)); + ASSERT_OK(result->ValidateFull()); + ASSERT_EQ(16, result->length()); + if (!is_union(scalar->type->id())) { + ASSERT_EQ(8, result->null_count()); } } diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index c84e66d0697..2cc40a94a38 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -35,7 +35,6 @@ #include "arrow/buffer_builder.h" #include "arrow/status.h" #include "arrow/type.h" -#include "arrow/util/bit_run_reader.h" #include "arrow/util/macros.h" #include "arrow/util/string_view.h" // IWYU pragma: export #include "arrow/util/visibility.h" @@ -277,44 +276,17 @@ class BaseBinaryBuilder : public ArrayBuilder { Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, int64_t length) override { - if (length == 0) return Status::OK(); auto bitmap = array.GetValues(0, 0); - auto offsets = array.GetValues(1) + offset; + auto offsets = array.GetValues(1); auto data = array.GetValues(2, 0); - ARROW_RETURN_NOT_OK(Reserve(length)); - - auto copy_values = [&](int64_t position, int64_t length) { - const offset_type first_offset = offsets[position]; - const offset_type last_offset = offsets[position + length]; - const offset_type data_length = last_offset - first_offset; - ARROW_RETURN_NOT_OK(ReserveData(data_length)); - UnsafeAppendNextOffset(); - const offset_type base_offset = - offsets_builder_.data()[offsets_builder_.length() - 1]; - ARROW_RETURN_NOT_OK(value_data_builder_.Append(data + first_offset, data_length)); - for (int64_t i = 1; i < length; i++) { - offsets_builder_.UnsafeAppend(base_offset + - (offsets[position + i] - first_offset)); - } - UnsafeAppendToBitmap(length, true); - return Status::OK(); - }; - - if (!bitmap) { - return copy_values(0, length); - } - internal::BitRunReader reader(bitmap, array.offset + offset, length); - int64_t position = 0; - while (true) { - internal::BitRun run = reader.NextRun(); - if (run.length == 0) break; - - if (run.set) { - ARROW_RETURN_NOT_OK(copy_values(position, run.length)); + for (int64_t i = 0; i < length; i++) { + if (!bitmap || BitUtil::GetBit(bitmap, array.offset + offset + i)) { + const offset_type start = offsets[offset + i]; + const offset_type end = offsets[offset + i + 1]; + ARROW_RETURN_NOT_OK(Append(data + start, end - start)); } else { - ARROW_RETURN_NOT_OK(AppendNulls(run.length)); + ARROW_RETURN_NOT_OK(AppendNull()); } - position += run.length; } return Status::OK(); } From e5581e6300676be5d0efcd675ccc4a33ea9f240e Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 19 Aug 2021 13:21:53 -0400 Subject: [PATCH 24/31] ARROW-13222: [C++] Rename to AppendArraySlice --- cpp/src/arrow/array/array_test.cc | 24 +++++++-------- cpp/src/arrow/array/builder_base.h | 6 ++-- cpp/src/arrow/array/builder_binary.h | 8 ++--- cpp/src/arrow/array/builder_nested.h | 29 +++++++++---------- cpp/src/arrow/array/builder_primitive.h | 10 +++---- cpp/src/arrow/array/builder_union.cc | 16 +++++----- cpp/src/arrow/array/builder_union.h | 8 ++--- .../arrow/compute/kernels/scalar_if_else.cc | 2 +- 8 files changed, 50 insertions(+), 53 deletions(-) diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index e714f03b191..480c5f1c649 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -641,31 +641,31 @@ TEST_F(TestArray, TestAppendArraySlice) { std::unique_ptr builder; ASSERT_OK(MakeBuilder(pool_, scalar->type, &builder)); - ASSERT_OK(builder->AppendArraySliceUnchecked(*array->data(), 0, 4)); + ASSERT_OK(builder->AppendArraySlice(*array->data(), 0, 4)); ASSERT_EQ(4, builder->length()); - ASSERT_OK(builder->AppendArraySliceUnchecked(*array->data(), 0, 0)); + ASSERT_OK(builder->AppendArraySlice(*array->data(), 0, 0)); ASSERT_EQ(4, builder->length()); - ASSERT_OK(builder->AppendArraySliceUnchecked(*array->data(), 1, 0)); + ASSERT_OK(builder->AppendArraySlice(*array->data(), 1, 0)); ASSERT_EQ(4, builder->length()); - ASSERT_OK(builder->AppendArraySliceUnchecked(*array->data(), 1, 4)); + ASSERT_OK(builder->AppendArraySlice(*array->data(), 1, 4)); ASSERT_EQ(8, builder->length()); - ASSERT_OK(builder->AppendArraySliceUnchecked(*nulls->data(), 0, 4)); + ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 0, 4)); ASSERT_EQ(12, builder->length()); if (!is_union(scalar->type->id())) { ASSERT_EQ(4, builder->null_count()); } - ASSERT_OK(builder->AppendArraySliceUnchecked(*nulls->data(), 0, 0)); + ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 0, 0)); ASSERT_EQ(12, builder->length()); if (!is_union(scalar->type->id())) { ASSERT_EQ(4, builder->null_count()); } - ASSERT_OK(builder->AppendArraySliceUnchecked(*nulls->data(), 1, 0)); + ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 1, 0)); ASSERT_EQ(12, builder->length()); if (!is_union(scalar->type->id())) { ASSERT_EQ(4, builder->null_count()); } - ASSERT_OK(builder->AppendArraySliceUnchecked(*nulls->data(), 1, 4)); + ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 1, 4)); ASSERT_EQ(16, builder->length()); if (!is_union(scalar->type->id())) { ASSERT_EQ(8, builder->null_count()); @@ -683,13 +683,13 @@ TEST_F(TestArray, TestAppendArraySlice) { { ASSERT_OK_AND_ASSIGN(auto array, MakeArrayOfNull(null(), 16)); NullBuilder builder(pool_); - ASSERT_OK(builder.AppendArraySliceUnchecked(*array->data(), 0, 4)); + ASSERT_OK(builder.AppendArraySlice(*array->data(), 0, 4)); ASSERT_EQ(4, builder.length()); - ASSERT_OK(builder.AppendArraySliceUnchecked(*array->data(), 0, 0)); + ASSERT_OK(builder.AppendArraySlice(*array->data(), 0, 0)); ASSERT_EQ(4, builder.length()); - ASSERT_OK(builder.AppendArraySliceUnchecked(*array->data(), 1, 0)); + ASSERT_OK(builder.AppendArraySlice(*array->data(), 1, 0)); ASSERT_EQ(4, builder.length()); - ASSERT_OK(builder.AppendArraySliceUnchecked(*array->data(), 1, 4)); + ASSERT_OK(builder.AppendArraySlice(*array->data(), 1, 4)); ASSERT_EQ(8, builder.length()); std::shared_ptr result; ASSERT_OK(builder.Finish(&result)); diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 99fa33433c9..67203e79071 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -126,9 +126,9 @@ class ARROW_EXPORT ArrayBuilder { /// \brief Append a range of values from an array. /// /// The given array must be the same type as the builder. - virtual Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, - int64_t length) { - return Status::NotImplemented("AppendArraySliceUnchecked for builder for ", *type()); + virtual Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) { + return Status::NotImplemented("AppendArraySlice for builder for ", *type()); } /// For cases where raw data was memcpy'd into the internal buffers, allows us diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 2cc40a94a38..6ca65113f1c 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -274,8 +274,8 @@ class BaseBinaryBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, - int64_t length) override { + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override { auto bitmap = array.GetValues(0, 0); auto offsets = array.GetValues(1); auto data = array.GetValues(2, 0); @@ -512,8 +512,8 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status AppendEmptyValue() final; Status AppendEmptyValues(int64_t length) final; - Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, - int64_t length) override { + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override { return AppendValues( array.GetValues(1, 0) + ((array.offset + offset) * byte_width_), length, array.GetValues(0, 0), array.offset + offset); diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index f4b2ea7ccd0..e53b758efa3 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -122,16 +122,16 @@ class BaseListBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, - int64_t length) override { + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override { const offset_type* offsets = array.GetValues(1); const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { if (!validity || BitUtil::GetBit(validity, array.offset + row)) { ARROW_RETURN_NOT_OK(Append()); int64_t slot_length = offsets[row + 1] - offsets[row]; - ARROW_RETURN_NOT_OK(value_builder_->AppendArraySliceUnchecked( - *array.child_data[0], offsets[row], slot_length)); + ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(*array.child_data[0], + offsets[row], slot_length)); } else { ARROW_RETURN_NOT_OK(AppendNull()); } @@ -292,17 +292,17 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { Status AppendEmptyValues(int64_t length) final; - Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, - int64_t length) override { + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override { const int32_t* offsets = array.GetValues(1); const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { if (!validity || BitUtil::GetBit(validity, array.offset + row)) { ARROW_RETURN_NOT_OK(Append()); const int64_t slot_length = offsets[row + 1] - offsets[row]; - ARROW_RETURN_NOT_OK(key_builder_->AppendArraySliceUnchecked( + ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice( *array.child_data[0]->child_data[0], offsets[row], slot_length)); - ARROW_RETURN_NOT_OK(item_builder_->AppendArraySliceUnchecked( + ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice( *array.child_data[0]->child_data[1], offsets[row], slot_length)); } else { ARROW_RETURN_NOT_OK(AppendNull()); @@ -410,12 +410,11 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { Status AppendEmptyValues(int64_t length) final; - Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, - int64_t length) final { + Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final { const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { if (!validity || BitUtil::GetBit(validity, array.offset + row)) { - ARROW_RETURN_NOT_OK(value_builder_->AppendArraySliceUnchecked( + ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice( *array.child_data[0], list_size_ * (array.offset + row), list_size_)); ARROW_RETURN_NOT_OK(Append()); } else { @@ -518,11 +517,11 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, - int64_t length) override { + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override { for (int i = 0; static_cast(i) < children_.size(); i++) { - ARROW_RETURN_NOT_OK(children_[i]->AppendArraySliceUnchecked( - *array.child_data[i], array.offset + offset, length)); + ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(*array.child_data[i], + array.offset + offset, length)); } const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; ARROW_RETURN_NOT_OK(Reserve(length)); diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index 4b5d9f58b5e..67d58fc9d13 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -53,7 +53,7 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { Status Append(std::nullptr_t) { return AppendNull(); } - Status AppendArraySliceUnchecked(const ArrayData&, int64_t, int64_t length) override { + Status AppendArraySlice(const ArrayData&, int64_t, int64_t length) override { return AppendNulls(length); } @@ -275,8 +275,8 @@ class NumericBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, - int64_t length) override { + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override { return AppendValues(array.GetValues(1) + offset, length, array.GetValues(0, 0), array.offset + offset); } @@ -493,8 +493,8 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { Status AppendValues(int64_t length, bool value); - Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset, - int64_t length) override { + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override { return AppendValues(array.GetValues(1, 0), length, array.GetValues(0, 0), array.offset + offset); } diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index e2ff473d65d..092a97e5fc2 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -45,9 +45,8 @@ Status BasicUnionBuilder::FinishInternal(std::shared_ptr* out) { return Status::OK(); } -Status DenseUnionBuilder::AppendArraySliceUnchecked(const ArrayData& array, - const int64_t offset, - const int64_t length) { +Status DenseUnionBuilder::AppendArraySlice(const ArrayData& array, const int64_t offset, + const int64_t length) { const int8_t* type_codes = array.GetValues(1); const int32_t* offsets = array.GetValues(2); for (int64_t row = offset; row < offset + length; row++) { @@ -55,7 +54,7 @@ Status DenseUnionBuilder::AppendArraySliceUnchecked(const ArrayData& array, const int child_id = type_id_to_child_id_[type_code]; const int32_t union_offset = offsets[row]; ARROW_RETURN_NOT_OK(Append(type_code)); - ARROW_RETURN_NOT_OK(type_id_to_children_[type_code]->AppendArraySliceUnchecked( + ARROW_RETURN_NOT_OK(type_id_to_children_[type_code]->AppendArraySlice( *array.child_data[child_id], union_offset, /*length=*/1)); } return Status::OK(); @@ -138,12 +137,11 @@ int8_t BasicUnionBuilder::NextTypeId() { return dense_type_id_++; } -Status SparseUnionBuilder::AppendArraySliceUnchecked(const ArrayData& array, - const int64_t offset, - const int64_t length) { +Status SparseUnionBuilder::AppendArraySlice(const ArrayData& array, const int64_t offset, + const int64_t length) { for (size_t i = 0; i < type_codes_.size(); i++) { - type_id_to_children_[type_codes_[i]]->AppendArraySliceUnchecked( - *array.child_data[i], array.offset + offset, length); + type_id_to_children_[type_codes_[i]]->AppendArraySlice(*array.child_data[i], + array.offset + offset, length); } const int8_t* type_codes = array.GetValues(1); types_builder_.Append(type_codes + offset, length); diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index da44ba5922f..f62abd7c2ba 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -156,8 +156,8 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { return offsets_builder_.Append(offset); } - Status AppendArraySliceUnchecked(const ArrayData& array, const int64_t offset, - const int64_t length) override; + Status AppendArraySlice(const ArrayData& array, const int64_t offset, + const int64_t length) override; Status FinishInternal(std::shared_ptr* out) override; @@ -235,8 +235,8 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { /// is called, and all other child builders must have null or empty value appended. Status Append(int8_t next_type) { return types_builder_.Append(next_type); } - Status AppendArraySliceUnchecked(const ArrayData& array, const int64_t offset, - const int64_t length) override; + Status AppendArraySlice(const ArrayData& array, const int64_t offset, + const int64_t length) override; }; } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index 4de29319586..ca7b8a8a745 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1499,7 +1499,7 @@ static Status ExecVarWidthArrayCaseWhenImpl( const auto& array = source.array(); if (!array->buffers[0] || BitUtil::GetBit(array->buffers[0]->data(), array->offset + row)) { - RETURN_NOT_OK(raw_builder->AppendArraySliceUnchecked(*array, row, /*length=*/1)); + RETURN_NOT_OK(raw_builder->AppendArraySlice(*array, row, /*length=*/1)); } else { RETURN_NOT_OK(raw_builder->AppendNull()); } From ee1a6b0b468272669799589bc4a3fe4078fce98a Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 19 Aug 2021 14:06:28 -0400 Subject: [PATCH 25/31] ARROW-13222: [C++] Add forgotten RETURN_NOT_OK --- cpp/src/arrow/array/builder_union.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index 092a97e5fc2..e66e21a858d 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -53,8 +53,8 @@ Status DenseUnionBuilder::AppendArraySlice(const ArrayData& array, const int64_t const int8_t type_code = type_codes[row]; const int child_id = type_id_to_child_id_[type_code]; const int32_t union_offset = offsets[row]; - ARROW_RETURN_NOT_OK(Append(type_code)); - ARROW_RETURN_NOT_OK(type_id_to_children_[type_code]->AppendArraySlice( + RETURN_NOT_OK(Append(type_code)); + RETURN_NOT_OK(type_id_to_children_[type_code]->AppendArraySlice( *array.child_data[child_id], union_offset, /*length=*/1)); } return Status::OK(); @@ -140,8 +140,8 @@ int8_t BasicUnionBuilder::NextTypeId() { Status SparseUnionBuilder::AppendArraySlice(const ArrayData& array, const int64_t offset, const int64_t length) { for (size_t i = 0; i < type_codes_.size(); i++) { - type_id_to_children_[type_codes_[i]]->AppendArraySlice(*array.child_data[i], - array.offset + offset, length); + RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendArraySlice( + *array.child_data[i], array.offset + offset, length)); } const int8_t* type_codes = array.GetValues(1); types_builder_.Append(type_codes + offset, length); From ef62b84ace0345e9a4d1e9238cf5d195ab578ca8 Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 19 Aug 2021 14:19:45 -0400 Subject: [PATCH 26/31] ARROW-13222: [C++] Add another forgotten RETURN_NOT_OK --- cpp/src/arrow/array/builder_union.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index e66e21a858d..8a9ec45685b 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -144,7 +144,7 @@ Status SparseUnionBuilder::AppendArraySlice(const ArrayData& array, const int64_ *array.child_data[i], array.offset + offset, length)); } const int8_t* type_codes = array.GetValues(1); - types_builder_.Append(type_codes + offset, length); + RETURN_NOT_OK(types_builder_.Append(type_codes + offset, length)); return Status::OK(); } From b52dab8b6635cd6c76e53efff008b7487bfdd7c3 Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 19 Aug 2021 15:17:32 -0400 Subject: [PATCH 27/31] ARROW-13222: [C++] Fix qualifiers for MSVC2015 --- cpp/src/arrow/array/builder_union.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index f62abd7c2ba..c1a799e56bf 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -156,8 +156,8 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { return offsets_builder_.Append(offset); } - Status AppendArraySlice(const ArrayData& array, const int64_t offset, - const int64_t length) override; + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override; Status FinishInternal(std::shared_ptr* out) override; @@ -235,8 +235,8 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { /// is called, and all other child builders must have null or empty value appended. Status Append(int8_t next_type) { return types_builder_.Append(next_type); } - Status AppendArraySlice(const ArrayData& array, const int64_t offset, - const int64_t length) override; + Status AppendArraySlice(const ArrayData& array, int64_t offset, + int64_t length) override; }; } // namespace arrow From 814b2f33390938a69f651bdd9f225eeeadfb0a0c Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 19 Aug 2021 15:29:44 -0400 Subject: [PATCH 28/31] ARROW-13222: [C++] Add cast for MSVC --- cpp/src/arrow/array/builder_union.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index 8a9ec45685b..6096b76ff21 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -89,7 +89,7 @@ BasicUnionBuilder::BasicUnionBuilder( child_fields_[i] = union_type.field(static_cast(i)); auto type_id = union_type.type_codes()[i]; - type_id_to_child_id_[type_id] = i; + type_id_to_child_id_[type_id] = static_cast(i); type_id_to_children_[type_id] = children[i].get(); } } From 2f67f7ff8abd4673d2cbd239b8b0aad776e30bec Mon Sep 17 00:00:00 2001 From: David Li Date: Fri, 20 Aug 2021 09:47:36 -0400 Subject: [PATCH 29/31] ARROW-13222: [C++] Try to avoid apparently miscompilation under MinGW toolchain --- .../arrow/compute/kernels/scalar_if_else.cc | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index ca7b8a8a745..cd4d1f7e3e4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1623,9 +1623,10 @@ struct CaseWhenFunctor { } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - return ExecVarWidthArrayCaseWhen(ctx, batch, out, - // ReserveData - [](ArrayBuilder*) { return Status::OK(); }); + std::function reserve_data = [](ArrayBuilder*) { + return Status::OK(); + }; + return ExecVarWidthArrayCaseWhen(ctx, batch, out, std::move(reserve_data)); } }; @@ -1642,9 +1643,10 @@ struct CaseWhenFunctor { } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - return ExecVarWidthArrayCaseWhen(ctx, batch, out, - // ReserveData - [](ArrayBuilder*) { return Status::OK(); }); + std::function reserve_data = [](ArrayBuilder*) { + return Status::OK(); + }; + return ExecVarWidthArrayCaseWhen(ctx, batch, out, std::move(reserve_data)); } }; @@ -1688,10 +1690,10 @@ struct CaseWhenFunctor> { } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - return ExecVarWidthArrayCaseWhen( - ctx, batch, out, - // ReserveData - [&](ArrayBuilder* raw_builder) { return Status::OK(); }); + std::function reserve_data = [](ArrayBuilder*) { + return Status::OK(); + }; + return ExecVarWidthArrayCaseWhen(ctx, batch, out, std::move(reserve_data)); } }; From 4cc6387fd50c7dd40c322d3b8e27103a8d1a5451 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 23 Aug 2021 08:14:07 -0400 Subject: [PATCH 30/31] ARROW-13222: [C++] Try to fix MinGW again --- cpp/src/arrow/compute/kernels/scalar_if_else.cc | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index cd4d1f7e3e4..affe9267942 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1610,6 +1610,9 @@ struct CaseWhenFunctor> { } }; +// No-op reserve function, pulled out to avoid apparent miscompilation on MinGW +Status ReserveNoData(ArrayBuilder*) { return Status::OK(); } + template <> struct CaseWhenFunctor { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { @@ -1623,9 +1626,7 @@ struct CaseWhenFunctor { } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - std::function reserve_data = [](ArrayBuilder*) { - return Status::OK(); - }; + std::function reserve_data = ReserveNoData; return ExecVarWidthArrayCaseWhen(ctx, batch, out, std::move(reserve_data)); } }; @@ -1643,9 +1644,7 @@ struct CaseWhenFunctor { } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - std::function reserve_data = [](ArrayBuilder*) { - return Status::OK(); - }; + std::function reserve_data = ReserveNoData; return ExecVarWidthArrayCaseWhen(ctx, batch, out, std::move(reserve_data)); } }; @@ -1690,9 +1689,7 @@ struct CaseWhenFunctor> { } static Status ExecArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - std::function reserve_data = [](ArrayBuilder*) { - return Status::OK(); - }; + std::function reserve_data = ReserveNoData; return ExecVarWidthArrayCaseWhen(ctx, batch, out, std::move(reserve_data)); } }; From f032791f06c6b3d1451a9a9cd083def09a24666e Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 23 Aug 2021 09:30:33 -0400 Subject: [PATCH 31/31] ARROW-13222: [C++] Update compute.rst --- docs/source/cpp/compute.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index ade2cdaa7d5..6de5a6ed9a7 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -902,7 +902,7 @@ Structural transforms +--------------------------+------------+---------------------------------------------------+---------------------+---------+ | Function name | Arity | Input types | Output type | Notes | +==========================+============+===================================================+=====================+=========+ -| case_when | Varargs | Struct of Boolean (Arg 0), Any fixed-width (rest) | Input type | \(1) | +| case_when | Varargs | Struct of Boolean (Arg 0), Any (rest) | Input type | \(1) | +--------------------------+------------+---------------------------------------------------+---------------------+---------+ | choose | Varargs | Integral (Arg 0); Fixed-width/Binary-like (rest) | Input type | \(2) | +--------------------------+------------+---------------------------------------------------+---------------------+---------+ @@ -936,6 +936,9 @@ Structural transforms the first value datum for which the corresponding Boolean is true, or the corresponding value from the 'default' input, or null otherwise. + Note that currently, while all types are supported, dictionaries will be + unpacked. + * \(2) The first input must be an integral type. The rest of the arguments can be any type, but must all be the same type or promotable to a common type. Each value of the first input (the 'index') is used as a zero-based index into the