From 26b32d5eb869601722acd1bd9f416eced47fa313 Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Wed, 28 Apr 2021 06:10:53 +0000 Subject: [PATCH 1/6] ARROW-12568: [C++][Compute] Segfault when casting sliced ListArray --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 9364120c133..ec92dbb5d60 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -65,9 +65,11 @@ Status CastListExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { Datum values = in_array.child_data[0]; if (in_array.offset != 0) { - ARROW_ASSIGN_OR_RAISE(out_array->buffers[0], - CopyBitmap(ctx->memory_pool(), in_array.buffers[0]->data(), - in_array.offset, in_array.length)); + if (in_array.buffers[0]) { + ARROW_ASSIGN_OR_RAISE(out_array->buffers[0], + CopyBitmap(ctx->memory_pool(), in_array.buffers[0]->data(), + in_array.offset, in_array.length)); + } ARROW_ASSIGN_OR_RAISE(out_array->buffers[1], ctx->Allocate(sizeof(offset_type) * (in_array.length + 1))); From f2f9dc50dd0287f6469340cb9756cc8a2c259c59 Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Wed, 28 Apr 2021 10:02:50 +0000 Subject: [PATCH 2/6] fix scalar boolean kernel nullptr deference --- .../arrow/compute/kernels/scalar_boolean.cc | 51 ++++++++++++++----- .../compute/kernels/scalar_boolean_test.cc | 18 +++++++ 2 files changed, 57 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_boolean.cc b/cpp/src/arrow/compute/kernels/scalar_boolean.cc index d555a81392a..26eaf83e023 100644 --- a/cpp/src/arrow/compute/kernels/scalar_boolean.cc +++ b/cpp/src/arrow/compute/kernels/scalar_boolean.cc @@ -175,15 +175,24 @@ struct KleeneAnd : Commutative { } if (right_true) { - GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0)); + if (left.null_count == 0) { + GetBitmap(*out, 0).SetBitsTo(true); + } else { + GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0)); + } GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1)); return Status::OK(); } // scalar was null: out[i] is valid iff left[i] was false - ::arrow::internal::BitmapAndNot(left.buffers[0]->data(), left.offset, - left.buffers[1]->data(), left.offset, left.length, - out->offset, out->buffers[0]->mutable_data()); + if (left.null_count == 0) { + ::arrow::internal::InvertBitmap(left.buffers[1]->data(), left.offset, left.length, + out->buffers[0]->mutable_data(), out->offset); + } else { + ::arrow::internal::BitmapAndNot(left.buffers[0]->data(), left.offset, + left.buffers[1]->data(), left.offset, left.length, + out->offset, out->buffers[0]->mutable_data()); + } ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length, out->buffers[1]->mutable_data(), out->offset); return Status::OK(); @@ -266,15 +275,24 @@ struct KleeneOr : Commutative { } if (right_false) { - GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0)); + if (left.null_count == 0) { + GetBitmap(*out, 0).SetBitsTo(true); + } else { + GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0)); + } GetBitmap(*out, 1).CopyFrom(GetBitmap(left, 1)); return Status::OK(); } // scalar was null: out[i] is valid iff left[i] was true - ::arrow::internal::BitmapAnd(left.buffers[0]->data(), left.offset, - left.buffers[1]->data(), left.offset, left.length, - out->offset, out->buffers[0]->mutable_data()); + if (left.null_count == 0) { + ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length, + out->buffers[0]->mutable_data(), out->offset); + } else { + ::arrow::internal::BitmapAnd(left.buffers[0]->data(), left.offset, + left.buffers[1]->data(), left.offset, left.length, + out->offset, out->buffers[0]->mutable_data()); + } ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length, out->buffers[1]->mutable_data(), out->offset); return Status::OK(); @@ -379,15 +397,24 @@ struct KleeneAndNot { } if (left_true) { - GetBitmap(*out, 0).CopyFrom(GetBitmap(right, 0)); + if (right.null_count == 0) { + GetBitmap(*out, 0).SetBitsTo(true); + } else { + GetBitmap(*out, 0).CopyFrom(GetBitmap(right, 0)); + } GetBitmap(*out, 1).CopyFromInverted(GetBitmap(right, 1)); return Status::OK(); } // scalar was null: out[i] is valid iff right[i] was true - ::arrow::internal::BitmapAnd(right.buffers[0]->data(), right.offset, - right.buffers[1]->data(), right.offset, right.length, - out->offset, out->buffers[0]->mutable_data()); + if (right.null_count == 0) { + ::arrow::internal::CopyBitmap(right.buffers[1]->data(), right.offset, right.length, + out->buffers[0]->mutable_data(), out->offset); + } else { + ::arrow::internal::BitmapAnd(right.buffers[0]->data(), right.offset, + right.buffers[1]->data(), right.offset, right.length, + out->offset, out->buffers[0]->mutable_data()); + } ::arrow::internal::InvertBitmap(right.buffers[1]->data(), right.offset, right.length, out->buffers[1]->mutable_data(), out->offset); return Status::OK(); diff --git a/cpp/src/arrow/compute/kernels/scalar_boolean_test.cc b/cpp/src/arrow/compute/kernels/scalar_boolean_test.cc index 7d3f68e2aef..4c11eb6db30 100644 --- a/cpp/src/arrow/compute/kernels/scalar_boolean_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_boolean_test.cc @@ -110,6 +110,12 @@ TEST(TestBooleanKernel, KleeneAnd) { expected = ArrayFromJSON(boolean(), "[true, false, false, null, false]"); CheckScalarBinary("and_kleene", left, right, expected); CheckBooleanScalarArrayBinary("and_kleene", left); + + left = ArrayFromJSON(boolean(), " [true, true, false, true]"); + right = ArrayFromJSON(boolean(), " [true, false, false, false]"); + expected = ArrayFromJSON(boolean(), "[true, false, false, false]"); + CheckScalarBinary("and_kleene", left, right, expected); + CheckBooleanScalarArrayBinary("and_kleene", left); } TEST(TestBooleanKernel, KleeneAndNot) { @@ -121,6 +127,12 @@ TEST(TestBooleanKernel, KleeneAndNot) { boolean(), "[false, true, null, false, false, false, false, null, null]"); CheckScalarBinary("and_not_kleene", left, right, expected); CheckBooleanScalarArrayBinary("and_not_kleene", left); + + left = ArrayFromJSON(boolean(), " [true, true, false, false]"); + right = ArrayFromJSON(boolean(), " [true, false, true, false]"); + expected = ArrayFromJSON(boolean(), "[false, true, false, false]"); + CheckScalarBinary("and_not_kleene", left, right, expected); + CheckBooleanScalarArrayBinary("and_not_kleene", left); } TEST(TestBooleanKernel, KleeneOr) { @@ -135,6 +147,12 @@ TEST(TestBooleanKernel, KleeneOr) { expected = ArrayFromJSON(boolean(), "[true, true, false, true, null]"); CheckScalarBinary("or_kleene", left, right, expected); CheckBooleanScalarArrayBinary("or_kleene", left); + + left = ArrayFromJSON(boolean(), " [true, true, false, false]"); + right = ArrayFromJSON(boolean(), " [true, false, false, true]"); + expected = ArrayFromJSON(boolean(), "[true, true, false, true]"); + CheckScalarBinary("or_kleene", left, right, expected); + CheckBooleanScalarArrayBinary("or_kleene", left); } } // namespace compute From 4af89d1aed0b8ff2d6185b2c884239dc7d1079a2 Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Wed, 28 Apr 2021 10:26:29 +0000 Subject: [PATCH 3/6] fix hash aggregate kernel nullptr deference --- .../arrow/compute/kernels/hash_aggregate.cc | 8 ++++--- .../compute/kernels/hash_aggregate_test.cc | 23 +++++++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index f45e82e04af..ae7bf9324db 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -483,9 +483,11 @@ struct GroupedCountImpl : public GroupedAggregator { const auto& input = batch[0].array(); if (options_.count_mode == CountOptions::COUNT_NULL) { - for (int64_t i = 0, input_i = input->offset; i < input->length; ++i, ++input_i) { - auto g = group_ids[i]; - raw_counts[g] += !BitUtil::GetBit(input->buffers[0]->data(), input_i); + if (input->GetNullCount() != 0) { + for (int64_t i = 0, input_i = input->offset; i < input->length; ++i, ++input_i) { + auto g = group_ids[i]; + raw_counts[g] += !BitUtil::GetBit(input->buffers[0]->data(), input_i); + } } return Status::OK(); } diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc index 7858d8bb147..507f1716110 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate_test.cc @@ -678,6 +678,29 @@ TEST(GroupBy, ConcreteCaseWithValidateGroupBy) { } } +// Count nulls/non_nulls from record batch with no nulls +TEST(GroupBy, CountNull) { + auto batch = RecordBatchFromJSON( + schema({field("argument", float64()), field("key", utf8())}), R"([ + [1.0, "alfa"], + [2.0, "beta"], + [3.0, "gama"] + ])"); + + CountOptions count_non_null{CountOptions::COUNT_NON_NULL}, + count_null{CountOptions::COUNT_NULL}; + + using internal::Aggregate; + for (auto agg : { + Aggregate{"hash_count", &count_non_null}, + Aggregate{"hash_count", &count_null}, + }) { + SCOPED_TRACE(agg.function); + ValidateGroupBy({agg}, {batch->GetColumnByName("argument")}, + {batch->GetColumnByName("key")}); + } +} + TEST(GroupBy, RandomArraySum) { for (int64_t length : {1 << 10, 1 << 12, 1 << 15}) { for (auto null_probability : {0.0, 0.01, 0.5, 1.0}) { From 846cebef20e1a97bf653f97d231c007ea89ac8b5 Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Sun, 2 May 2021 07:38:12 +0000 Subject: [PATCH 4/6] data.null_count -> data.GetNullCount() --- cpp/src/arrow/compute/kernels/scalar_boolean.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_boolean.cc b/cpp/src/arrow/compute/kernels/scalar_boolean.cc index 26eaf83e023..5a417f79110 100644 --- a/cpp/src/arrow/compute/kernels/scalar_boolean.cc +++ b/cpp/src/arrow/compute/kernels/scalar_boolean.cc @@ -175,7 +175,7 @@ struct KleeneAnd : Commutative { } if (right_true) { - if (left.null_count == 0) { + if (left.GetNullCount() == 0) { GetBitmap(*out, 0).SetBitsTo(true); } else { GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0)); @@ -185,7 +185,7 @@ struct KleeneAnd : Commutative { } // scalar was null: out[i] is valid iff left[i] was false - if (left.null_count == 0) { + if (left.GetNullCount() == 0) { ::arrow::internal::InvertBitmap(left.buffers[1]->data(), left.offset, left.length, out->buffers[0]->mutable_data(), out->offset); } else { @@ -275,7 +275,7 @@ struct KleeneOr : Commutative { } if (right_false) { - if (left.null_count == 0) { + if (left.GetNullCount() == 0) { GetBitmap(*out, 0).SetBitsTo(true); } else { GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0)); @@ -285,7 +285,7 @@ struct KleeneOr : Commutative { } // scalar was null: out[i] is valid iff left[i] was true - if (left.null_count == 0) { + if (left.GetNullCount() == 0) { ::arrow::internal::CopyBitmap(left.buffers[1]->data(), left.offset, left.length, out->buffers[0]->mutable_data(), out->offset); } else { @@ -397,7 +397,7 @@ struct KleeneAndNot { } if (left_true) { - if (right.null_count == 0) { + if (right.GetNullCount() == 0) { GetBitmap(*out, 0).SetBitsTo(true); } else { GetBitmap(*out, 0).CopyFrom(GetBitmap(right, 0)); @@ -407,7 +407,7 @@ struct KleeneAndNot { } // scalar was null: out[i] is valid iff right[i] was true - if (right.null_count == 0) { + if (right.GetNullCount() == 0) { ::arrow::internal::CopyBitmap(right.buffers[1]->data(), right.offset, right.length, out->buffers[0]->mutable_data(), out->offset); } else { From 3198b7ec9d9dfad53219bac15f2e728bcb5b0e34 Mon Sep 17 00:00:00 2001 From: Yibo Cai Date: Mon, 3 May 2021 14:08:14 +0000 Subject: [PATCH 5/6] clear out->buffers[0] if no nulls in output --- .../arrow/compute/kernels/scalar_boolean.cc | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_boolean.cc b/cpp/src/arrow/compute/kernels/scalar_boolean.cc index 5a417f79110..3d47d239888 100644 --- a/cpp/src/arrow/compute/kernels/scalar_boolean.cc +++ b/cpp/src/arrow/compute/kernels/scalar_boolean.cc @@ -169,14 +169,16 @@ struct KleeneAnd : Commutative { bool right_false = right.is_valid && !checked_cast(right).value; if (right_false) { - GetBitmap(*out, 0).SetBitsTo(true); + out->null_count = 0; + out->buffers[0] = nullptr; GetBitmap(*out, 1).SetBitsTo(false); // all false case return Status::OK(); } if (right_true) { if (left.GetNullCount() == 0) { - GetBitmap(*out, 0).SetBitsTo(true); + out->null_count = 0; + out->buffers[0] = nullptr; } else { GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0)); } @@ -201,7 +203,8 @@ struct KleeneAnd : Commutative { static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right, ArrayData* out) { if (left.GetNullCount() == 0 && right.GetNullCount() == 0) { - GetBitmap(*out, 0).SetBitsTo(true); + out->null_count = 0; + out->buffers[0] = nullptr; return And::Call(ctx, left, right, out); } auto compute_word = [](uint64_t left_true, uint64_t left_false, uint64_t right_true, @@ -269,14 +272,16 @@ struct KleeneOr : Commutative { bool right_false = right.is_valid && !checked_cast(right).value; if (right_true) { - GetBitmap(*out, 0).SetBitsTo(true); + out->null_count = 0; + out->buffers[0] = nullptr; GetBitmap(*out, 1).SetBitsTo(true); // all true case return Status::OK(); } if (right_false) { if (left.GetNullCount() == 0) { - GetBitmap(*out, 0).SetBitsTo(true); + out->null_count = 0; + out->buffers[0] = nullptr; } else { GetBitmap(*out, 0).CopyFrom(GetBitmap(left, 0)); } @@ -301,7 +306,8 @@ struct KleeneOr : Commutative { static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right, ArrayData* out) { if (left.GetNullCount() == 0 && right.GetNullCount() == 0) { - GetBitmap(*out, 0).SetBitsTo(true); + out->null_count = 0; + out->buffers[0] = nullptr; return Or::Call(ctx, left, right, out); } @@ -391,14 +397,16 @@ struct KleeneAndNot { bool left_false = left.is_valid && !checked_cast(left).value; if (left_false) { - GetBitmap(*out, 0).SetBitsTo(true); + out->null_count = 0; + out->buffers[0] = nullptr; GetBitmap(*out, 1).SetBitsTo(false); // all false case return Status::OK(); } if (left_true) { if (right.GetNullCount() == 0) { - GetBitmap(*out, 0).SetBitsTo(true); + out->null_count = 0; + out->buffers[0] = nullptr; } else { GetBitmap(*out, 0).CopyFrom(GetBitmap(right, 0)); } @@ -428,7 +436,8 @@ struct KleeneAndNot { static Status Call(KernelContext* ctx, const ArrayData& left, const ArrayData& right, ArrayData* out) { if (left.GetNullCount() == 0 && right.GetNullCount() == 0) { - GetBitmap(*out, 0).SetBitsTo(true); + out->null_count = 0; + out->buffers[0] = nullptr; return AndNot::Call(ctx, left, right, out); } From af677ce3fe696c174af203d7f4b9dc4499283b9b Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 3 May 2021 16:50:40 +0200 Subject: [PATCH 6/6] Add test for list casting --- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 6efecbb2ad0..e9618fa5c5d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1702,6 +1702,20 @@ TEST(Cast, ListToList) { CheckCast(MakeArray(list_float32), MakeArray(list_int32)); CheckCast(MakeArray(list_int64), MakeArray(list_int32)); } + + // No nulls (ARROW-12568) + for (auto make_list : std::vector{&list, &large_list}) { + auto list_int32 = ArrayFromJSON(make_list(int32()), + "[[0], [1], [2, 3, 4], [5, 6], [], [7], [8, 9]]") + ->data(); + auto list_int64 = list_int32->Copy(); + list_int64->type = make_list(int64()); + list_int64->child_data[0] = Cast(list_int32->child_data[0], int64())->array(); + ASSERT_OK(MakeArray(list_int64)->ValidateFull()); + + CheckCast(MakeArray(list_int32), MakeArray(list_int64)); + CheckCast(MakeArray(list_int64), MakeArray(list_int32)); + } } TEST(Cast, ListToListOptionsPassthru) {