From d4dbe5d8aa9bc64b46b199e3882ec74f0f1a3938 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Sat, 26 Mar 2022 19:18:21 +0530 Subject: [PATCH 1/8] Add basic impl --- .../compute/kernels/scalar_cast_nested.cc | 102 ++++++++++++++++++ .../arrow/compute/kernels/scalar_cast_test.cc | 20 ++++ 2 files changed, 122 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index cc36c510363..a0b3556d0f6 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -228,6 +228,107 @@ void AddStructToStructCast(CastFunction* func) { DCHECK_OK(func->AddKernel(StructType::type_id, std::move(kernel))); } +struct CastStructSubset { + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const CastOptions& options = CastState::Get(ctx); + const auto& in_type = checked_cast(*batch[0].type()); + const auto& out_type = checked_cast(*out->type()); + const int in_field_count = in_type.num_fields(); + const int out_field_count = out_type.num_fields(); + + std::vector fields_to_select(in_field_count, false); + + int out_field_index = 0; + for (int in_field_index = 0; + in_field_index < in_field_count && out_field_index < out_field_count; + ++in_field_index) { + const auto in_field = in_type.field(in_field_index); + const auto out_field = out_type.field(out_field_index); + if (in_field->name() == out_field->name()) { + if (in_field->nullable() && !out_field->nullable()) { + return Status::TypeError("cannot cast nullable struct to non-nullable struct: ", + in_type.ToString(), " ", out_type.ToString()); + } + fields_to_select[in_field_index] = true; + ++out_field_index; + } + } + + if (out_field_index < out_field_count - 1) { + return Status::TypeError( + "struct subfields names don't match or are in the wrong order: ", + in_type.ToString(), " ", out_type.ToString()); + } + + if (out->kind() == Datum::SCALAR) { + const auto& in_scalar = checked_cast(*batch[0].scalar()); + auto out_scalar = checked_cast(out->scalar().get()); + + DCHECK(!out_scalar->is_valid); + if (in_scalar.is_valid) { + out_field_index = 0; + for (int in_field_index = 0; in_field_index < in_field_count; in_field_index++) { + if (fields_to_select[in_field_index]) { + auto values = in_scalar.value[in_field_index]; + auto target_type = out->type()->field(out_field_index++)->type(); + ARROW_ASSIGN_OR_RAISE(Datum cast_values, Cast(values, target_type, options, + ctx->exec_context())); + DCHECK_EQ(Datum::SCALAR, cast_values.kind()); + out_scalar->value.push_back(cast_values.scalar()); + } + } + out_scalar->is_valid = true; + } + return Status::OK(); + } + + const ArrayData& in_array = *batch[0].array(); + ArrayData* out_array = out->mutable_array(); + + if (in_array.GetNullCount() > 0) { + auto out_bitmap_builder = TypedBufferBuilder(ctx->memory_pool()); + const auto in_bitmap = in_array.buffers[0]->data(); + + for (int in_field_index = 0; in_field_index < in_field_count; in_field_index++) { + if (fields_to_select[in_field_index]) { + if (bit_util::GetBit(in_bitmap, in_array.offset + in_field_index)) { + ARROW_RETURN_NOT_OK(out_bitmap_builder.Append(true)); + } else { + ARROW_RETURN_NOT_OK(out_bitmap_builder.Append(false)); + } + } + } + ARROW_ASSIGN_OR_RAISE(out_array->buffers[0], out_bitmap_builder.Finish()); + } + + out_field_index = 0; + for (int in_field_index = 0; in_field_index < in_field_count; in_field_index++) { + if (fields_to_select[in_field_index]) { + auto values = + in_array.child_data[in_field_index]->Slice(in_array.offset, in_array.length); + auto target_type = out->type()->field(out_field_index++)->type(); + + ARROW_ASSIGN_OR_RAISE(Datum cast_values, + Cast(values, target_type, options, ctx->exec_context())); + + DCHECK_EQ(Datum::ARRAY, cast_values.kind()); + out_array->child_data.push_back(cast_values.array()); + } + } + + return Status::OK(); + } +}; + +void AddStructToStructSubsetCast(CastFunction* func) { + ScalarKernel kernel; + kernel.exec = CastStructSubset::Exec; + kernel.signature = + KernelSignature::Make({InputType(StructType::type_id)}, kOutputTargetType); + kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; + DCHECK_OK(func->AddKernel(StructType::type_id, std::move(kernel))); +} + } // namespace std::vector> GetNestedCasts() { @@ -252,6 +353,7 @@ std::vector> GetNestedCasts() { // So is struct auto cast_struct = std::make_shared("cast_struct", Type::STRUCT); AddCommonCasts(Type::STRUCT, kOutputTargetType, cast_struct.get()); + AddStructToStructSubsetCast(cast_struct.get()); AddStructToStructCast(cast_struct.get()); // So is dictionary diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index f13b05ccd07..91be0354845 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2329,6 +2329,26 @@ TEST(Cast, StructToSameSizedButDifferentNullabilityStruct) { Cast(src2, options)); } +TEST(Cats, StructSubset) { + std::vector field_names = {"a", "b", "c"}; + std::shared_ptr a, b, c; + a = ArrayFromJSON(int8(), "[1, 2, 5]"); + b = ArrayFromJSON(int8(), "[3, 4, 7]"); + c = ArrayFromJSON(int8(), "[9, 11, 44]"); + ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a, b, c}, field_names)); + ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make({a, c}, {"a", "c"})); + + // const auto dest = arrow::struct_( + // {std::make_shared("a", int8()), std::make_shared("c", int8())}); + + // const auto options = CastOptions::Safe(dest); + // + // auto res = Cast(src, options).ValueOrDie(); + // ARROW_LOG(WARNING) << res.make_array()->ToString(); + + CheckCast(src, dest); +} + TEST(Cast, IdentityCasts) { // ARROW-4102 auto CheckIdentityCast = [](std::shared_ptr type, const std::string& json) { From 92ffdda259dae2ad4b13518ee95af1809a24d1d6 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Sun, 27 Mar 2022 01:57:37 +0530 Subject: [PATCH 2/8] Fix small bug --- .../compute/kernels/scalar_cast_nested.cc | 2 +- .../arrow/compute/kernels/scalar_cast_test.cc | 45 +++++++++++++------ 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index a0b3556d0f6..ae7276f4bd5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -254,7 +254,7 @@ struct CastStructSubset { } } - if (out_field_index < out_field_count - 1) { + if (out_field_index < out_field_count) { return Status::TypeError( "struct subfields names don't match or are in the wrong order: ", in_type.ToString(), " ", out_type.ToString()); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 91be0354845..90702c677b5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2330,23 +2330,40 @@ TEST(Cast, StructToSameSizedButDifferentNullabilityStruct) { } TEST(Cats, StructSubset) { - std::vector field_names = {"a", "b", "c"}; - std::shared_ptr a, b, c; + std::vector field_names = {"a", "b", "c", "d", "e"}; + std::shared_ptr a, b, c, d, e; a = ArrayFromJSON(int8(), "[1, 2, 5]"); b = ArrayFromJSON(int8(), "[3, 4, 7]"); c = ArrayFromJSON(int8(), "[9, 11, 44]"); - ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a, b, c}, field_names)); - ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make({a, c}, {"a", "c"})); - - // const auto dest = arrow::struct_( - // {std::make_shared("a", int8()), std::make_shared("c", int8())}); - - // const auto options = CastOptions::Safe(dest); - // - // auto res = Cast(src, options).ValueOrDie(); - // ARROW_LOG(WARNING) << res.make_array()->ToString(); - - CheckCast(src, dest); + d = ArrayFromJSON(int8(), "[6, 51, 49]"); + e = ArrayFromJSON(int8(), "[19, 17, 74]"); + + ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a, b, c, d, e}, field_names)); + ASSERT_OK_AND_ASSIGN(auto dest1, StructArray::Make({a, c}, {"a", "c"})); + CheckCast(src, dest1); + + ASSERT_OK_AND_ASSIGN(auto dest2, StructArray::Make({b, d}, {"b", "d"})); + CheckCast(src, dest2); + + ASSERT_OK_AND_ASSIGN(auto dest3, StructArray::Make({c, e}, {"c", "e"})); + CheckCast(src, dest3); + + const auto dest4 = arrow::struct_({std::make_shared("a", int8()), + std::make_shared("d", int16()), + std::make_shared("e", int64())}); + const auto options4 = CastOptions::Safe(dest4); + auto res = Cast(src, options4).ValueOrDie(); + ARROW_LOG(WARNING) << res.make_array()->ToString(); + + const auto dest5 = arrow::struct_({std::make_shared("a", int8()), + std::make_shared("d", int16()), + std::make_shared("f", int64())}); + const auto options5 = CastOptions::Safe(dest5); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + ::testing::HasSubstr( + "struct subfields names don't match or are in the wrong order"), + Cast(src, options5)); } TEST(Cast, IdentityCasts) { From 87950b9649bbcc5f3905d3766bca12ceae0867eb Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Sun, 27 Mar 2022 12:38:44 +0530 Subject: [PATCH 3/8] Fix null bitmap logic, make tests pass --- .../compute/kernels/scalar_cast_nested.cc | 18 ++----- .../arrow/compute/kernels/scalar_cast_test.cc | 51 +++++++++++++++++-- cpp/src/arrow/dataset/scanner_test.cc | 5 +- 3 files changed, 54 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index ae7276f4bd5..df7dab841fa 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -285,20 +285,10 @@ struct CastStructSubset { const ArrayData& in_array = *batch[0].array(); ArrayData* out_array = out->mutable_array(); - if (in_array.GetNullCount() > 0) { - auto out_bitmap_builder = TypedBufferBuilder(ctx->memory_pool()); - const auto in_bitmap = in_array.buffers[0]->data(); - - for (int in_field_index = 0; in_field_index < in_field_count; in_field_index++) { - if (fields_to_select[in_field_index]) { - if (bit_util::GetBit(in_bitmap, in_array.offset + in_field_index)) { - ARROW_RETURN_NOT_OK(out_bitmap_builder.Append(true)); - } else { - ARROW_RETURN_NOT_OK(out_bitmap_builder.Append(false)); - } - } - } - ARROW_ASSIGN_OR_RAISE(out_array->buffers[0], out_bitmap_builder.Finish()); + if (in_array.buffers[0]) { + ARROW_ASSIGN_OR_RAISE(out_array->buffers[0], + CopyBitmap(ctx->memory_pool(), in_array.buffers[0]->data(), + in_array.offset, in_array.length)); } out_field_index = 0; diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 90702c677b5..8263c95dd8d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2262,8 +2262,8 @@ TEST(Cast, StructToSameSizedButDifferentNamedStruct) { EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, - ::testing::HasSubstr("Type error: struct field names do not match: struct struct"), + ::testing::HasSubstr( + "struct subfields names don't match or are in the wrong order"), Cast(src, options)); } @@ -2281,8 +2281,8 @@ TEST(Cast, StructToDifferentSizeStruct) { EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, - ::testing::HasSubstr("Type error: struct field sizes do not match: struct struct"), + ::testing::HasSubstr( + "struct subfields names don't match or are in the wrong order"), Cast(src, options)); } @@ -2329,7 +2329,7 @@ TEST(Cast, StructToSameSizedButDifferentNullabilityStruct) { Cast(src2, options)); } -TEST(Cats, StructSubset) { +TEST(Cast, StructSubset) { std::vector field_names = {"a", "b", "c", "d", "e"}; std::shared_ptr a, b, c, d, e; a = ArrayFromJSON(int8(), "[1, 2, 5]"); @@ -2366,6 +2366,47 @@ TEST(Cats, StructSubset) { Cast(src, options5)); } +TEST(Cast, StructSubsetWithNulls) { + std::vector field_names = {"a", "b", "c", "d", "e"}; + std::shared_ptr a, b, c, d, e; + a = ArrayFromJSON(int8(), "[1, 2, 5]"); + b = ArrayFromJSON(int8(), "[3, 4, 7]"); + c = ArrayFromJSON(int8(), "[9, 11, 44]"); + d = ArrayFromJSON(int8(), "[6, 51, 49]"); + e = ArrayFromJSON(int8(), "[19, 17, 74]"); + + std::shared_ptr null_bitmap; + BitmapFromVector({0, 1, 0}, &null_bitmap); + + ASSERT_OK_AND_ASSIGN(auto src, + StructArray::Make({a, b, c, d, e}, field_names, null_bitmap)); + ASSERT_OK_AND_ASSIGN(auto dest1, StructArray::Make({a, c}, {"a", "c"}, null_bitmap)); + CheckCast(src, dest1); + + ASSERT_OK_AND_ASSIGN(auto dest2, StructArray::Make({b, d}, {"b", "d"}, null_bitmap)); + CheckCast(src, dest2); + + ASSERT_OK_AND_ASSIGN(auto dest3, StructArray::Make({c, e}, {"c", "e"}, null_bitmap)); + CheckCast(src, dest3); + + const auto dest4 = arrow::struct_({std::make_shared("a", int8()), + std::make_shared("d", int16()), + std::make_shared("e", int64())}); + const auto options4 = CastOptions::Safe(dest4); + auto res = Cast(src, options4).ValueOrDie(); + ARROW_LOG(WARNING) << res.make_array()->ToString(); + + const auto dest5 = arrow::struct_({std::make_shared("a", int8()), + std::make_shared("d", int16()), + std::make_shared("f", int64())}); + const auto options5 = CastOptions::Safe(dest5); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + ::testing::HasSubstr( + "struct subfields names don't match or are in the wrong order"), + Cast(src, options5)); +} + TEST(Cast, IdentityCasts) { // ARROW-4102 auto CheckIdentityCast = [](std::shared_ptr type, const std::string& json) { diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index 7fedb7d3c72..b1b37684c18 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -1656,7 +1656,10 @@ TEST(ScanNode, MaterializationOfNestedVirtualColumn) { // TODO(ARROW-1888): allow scanner to "patch up" structs with casts EXPECT_FINISHES_AND_RAISES_WITH_MESSAGE_THAT( - TypeError, ::testing::HasSubstr("struct field sizes do not match"), plan.Run()); + TypeError, + ::testing::HasSubstr( + "struct subfields names don't match or are in the wrong order"), + plan.Run()); } TEST(ScanNode, MinimalEndToEnd) { From 7f264bbfd61a92203de187949e9603ab7666da0e Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Sun, 27 Mar 2022 13:29:16 +0530 Subject: [PATCH 4/8] Store field indices instead of bool filter --- .../compute/kernels/scalar_cast_nested.cc | 43 ++++----- .../arrow/compute/kernels/scalar_cast_test.cc | 96 ++++++++++++------- cpp/src/arrow/dataset/scanner_test.cc | 3 +- 3 files changed, 83 insertions(+), 59 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index df7dab841fa..367c6d88f81 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -236,7 +236,7 @@ struct CastStructSubset { const int in_field_count = in_type.num_fields(); const int out_field_count = out_type.num_fields(); - std::vector fields_to_select(in_field_count, false); + std::vector fields_to_select(out_field_count, -1); int out_field_index = 0; for (int in_field_index = 0; @@ -249,15 +249,14 @@ struct CastStructSubset { return Status::TypeError("cannot cast nullable struct to non-nullable struct: ", in_type.ToString(), " ", out_type.ToString()); } - fields_to_select[in_field_index] = true; - ++out_field_index; + fields_to_select[out_field_index++] = in_field_index; } } if (out_field_index < out_field_count) { return Status::TypeError( - "struct subfields names don't match or are in the wrong order: ", - in_type.ToString(), " ", out_type.ToString()); + "struct (sub)fields don't match or are in the wrong order: Input fields: ", + in_type.ToString(), " output fields: ", out_type.ToString()); } if (out->kind() == Datum::SCALAR) { @@ -267,15 +266,13 @@ struct CastStructSubset { DCHECK(!out_scalar->is_valid); if (in_scalar.is_valid) { out_field_index = 0; - for (int in_field_index = 0; in_field_index < in_field_count; in_field_index++) { - if (fields_to_select[in_field_index]) { - auto values = in_scalar.value[in_field_index]; - auto target_type = out->type()->field(out_field_index++)->type(); - ARROW_ASSIGN_OR_RAISE(Datum cast_values, Cast(values, target_type, options, - ctx->exec_context())); - DCHECK_EQ(Datum::SCALAR, cast_values.kind()); - out_scalar->value.push_back(cast_values.scalar()); - } + for (int field_index : fields_to_select) { + auto values = in_scalar.value[field_index]; + auto target_type = out->type()->field(out_field_index++)->type(); + ARROW_ASSIGN_OR_RAISE(Datum cast_values, + Cast(values, target_type, options, ctx->exec_context())); + DCHECK_EQ(Datum::SCALAR, cast_values.kind()); + out_scalar->value.push_back(cast_values.scalar()); } out_scalar->is_valid = true; } @@ -292,18 +289,16 @@ struct CastStructSubset { } out_field_index = 0; - for (int in_field_index = 0; in_field_index < in_field_count; in_field_index++) { - if (fields_to_select[in_field_index]) { - auto values = - in_array.child_data[in_field_index]->Slice(in_array.offset, in_array.length); - auto target_type = out->type()->field(out_field_index++)->type(); + for (int field_index : fields_to_select) { + auto values = + in_array.child_data[field_index]->Slice(in_array.offset, in_array.length); + auto target_type = out->type()->field(out_field_index++)->type(); - ARROW_ASSIGN_OR_RAISE(Datum cast_values, - Cast(values, target_type, options, ctx->exec_context())); + ARROW_ASSIGN_OR_RAISE(Datum cast_values, + Cast(values, target_type, options, ctx->exec_context())); - DCHECK_EQ(Datum::ARRAY, cast_values.kind()); - out_array->child_data.push_back(cast_values.array()); - } + DCHECK_EQ(Datum::ARRAY, cast_values.kind()); + out_array->child_data.push_back(cast_values.array()); } return Status::OK(); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 8263c95dd8d..c15bc78cede 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2262,8 +2262,7 @@ TEST(Cast, StructToSameSizedButDifferentNamedStruct) { EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, - ::testing::HasSubstr( - "struct subfields names don't match or are in the wrong order"), + ::testing::HasSubstr("struct (sub)fields don't match or are in the wrong order"), Cast(src, options)); } @@ -2281,8 +2280,7 @@ TEST(Cast, StructToDifferentSizeStruct) { EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, - ::testing::HasSubstr( - "struct subfields names don't match or are in the wrong order"), + ::testing::HasSubstr("struct (sub)fields don't match or are in the wrong order"), Cast(src, options)); } @@ -2339,31 +2337,43 @@ TEST(Cast, StructSubset) { e = ArrayFromJSON(int8(), "[19, 17, 74]"); ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a, b, c, d, e}, field_names)); - ASSERT_OK_AND_ASSIGN(auto dest1, StructArray::Make({a, c}, {"a", "c"})); + ASSERT_OK_AND_ASSIGN(auto dest1, + StructArray::Make({a, c}, std::vector{"a", "c"})); CheckCast(src, dest1); - ASSERT_OK_AND_ASSIGN(auto dest2, StructArray::Make({b, d}, {"b", "d"})); + ASSERT_OK_AND_ASSIGN(auto dest2, + StructArray::Make({b, d}, std::vector{"b", "d"})); CheckCast(src, dest2); - ASSERT_OK_AND_ASSIGN(auto dest3, StructArray::Make({c, e}, {"c", "e"})); + ASSERT_OK_AND_ASSIGN(auto dest3, + StructArray::Make({c, e}, std::vector{"c", "e"})); CheckCast(src, dest3); - const auto dest4 = arrow::struct_({std::make_shared("a", int8()), - std::make_shared("d", int16()), - std::make_shared("e", int64())}); - const auto options4 = CastOptions::Safe(dest4); - auto res = Cast(src, options4).ValueOrDie(); - ARROW_LOG(WARNING) << res.make_array()->ToString(); + ASSERT_OK_AND_ASSIGN( + auto dest4, StructArray::Make({a, d, e}, std::vector{"a", "d", "e"})); + CheckCast(src, dest4); + + ASSERT_OK_AND_ASSIGN( + auto dest5, StructArray::Make({b, c, e}, std::vector{"b", "c", "e"})); + CheckCast(src, dest5); + + ASSERT_OK_AND_ASSIGN( + auto dest6, + StructArray::Make({a, b, c, e}, std::vector{"a", "b", "c", "e"})); + CheckCast(src, dest6); + + ASSERT_OK_AND_ASSIGN(auto dest7, + StructArray::Make({a, b, c, d, e}, {"a", "b", "c", "d", "e"})); + CheckCast(src, dest7); - const auto dest5 = arrow::struct_({std::make_shared("a", int8()), + const auto dest8 = arrow::struct_({std::make_shared("a", int8()), std::make_shared("d", int16()), std::make_shared("f", int64())}); - const auto options5 = CastOptions::Safe(dest5); + const auto options = CastOptions::Safe(dest8); EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, - ::testing::HasSubstr( - "struct subfields names don't match or are in the wrong order"), - Cast(src, options5)); + ::testing::HasSubstr("struct (sub)fields don't match or are in the wrong order"), + Cast(src, options)); } TEST(Cast, StructSubsetWithNulls) { @@ -2380,31 +2390,51 @@ TEST(Cast, StructSubsetWithNulls) { ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a, b, c, d, e}, field_names, null_bitmap)); - ASSERT_OK_AND_ASSIGN(auto dest1, StructArray::Make({a, c}, {"a", "c"}, null_bitmap)); + ASSERT_OK_AND_ASSIGN( + auto dest1, + StructArray::Make({a, c}, std::vector{"a", "c"}, null_bitmap)); CheckCast(src, dest1); - ASSERT_OK_AND_ASSIGN(auto dest2, StructArray::Make({b, d}, {"b", "d"}, null_bitmap)); + ASSERT_OK_AND_ASSIGN( + auto dest2, + StructArray::Make({b, d}, std::vector{"b", "d"}, null_bitmap)); CheckCast(src, dest2); - ASSERT_OK_AND_ASSIGN(auto dest3, StructArray::Make({c, e}, {"c", "e"}, null_bitmap)); + ASSERT_OK_AND_ASSIGN( + auto dest3, + StructArray::Make({c, e}, std::vector{"c", "e"}, null_bitmap)); CheckCast(src, dest3); - const auto dest4 = arrow::struct_({std::make_shared("a", int8()), - std::make_shared("d", int16()), - std::make_shared("e", int64())}); - const auto options4 = CastOptions::Safe(dest4); - auto res = Cast(src, options4).ValueOrDie(); - ARROW_LOG(WARNING) << res.make_array()->ToString(); - - const auto dest5 = arrow::struct_({std::make_shared("a", int8()), + ASSERT_OK_AND_ASSIGN( + auto dest4, + StructArray::Make({a, d, e}, std::vector{"a", "d", "e"}, null_bitmap)); + CheckCast(src, dest4); + + ASSERT_OK_AND_ASSIGN( + auto dest5, + StructArray::Make({b, c, e}, std::vector{"b", "c", "e"}, null_bitmap)); + CheckCast(src, dest5); + + ASSERT_OK_AND_ASSIGN( + auto dest6, + StructArray::Make({a, b, c, e}, std::vector{"a", "b", "c", "e"}, + null_bitmap)); + CheckCast(src, dest6); + + ASSERT_OK_AND_ASSIGN( + auto dest7, + StructArray::Make({a, b, c, d, e}, + std::vector{"a", "b", "c", "d", "e"}, null_bitmap)); + CheckCast(src, dest7); + + const auto dest8 = arrow::struct_({std::make_shared("a", int8()), std::make_shared("d", int16()), std::make_shared("f", int64())}); - const auto options5 = CastOptions::Safe(dest5); + const auto options = CastOptions::Safe(dest8); EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, - ::testing::HasSubstr( - "struct subfields names don't match or are in the wrong order"), - Cast(src, options5)); + ::testing::HasSubstr("struct (sub)fields don't match or are in the wrong order"), + Cast(src, options)); } TEST(Cast, IdentityCasts) { diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index b1b37684c18..7e0bbb40bd1 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -1657,8 +1657,7 @@ TEST(ScanNode, MaterializationOfNestedVirtualColumn) { // TODO(ARROW-1888): allow scanner to "patch up" structs with casts EXPECT_FINISHES_AND_RAISES_WITH_MESSAGE_THAT( TypeError, - ::testing::HasSubstr( - "struct subfields names don't match or are in the wrong order"), + ::testing::HasSubstr("struct (sub)fields don't match or are in the wrong order"), plan.Run()); } From 29b13d665fb316ca7e738d860b5b70cf936d0cc1 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Tue, 29 Mar 2022 00:25:00 +0530 Subject: [PATCH 5/8] Extend current impl --- .../compute/kernels/scalar_cast_nested.cc | 83 +----- .../arrow/compute/kernels/scalar_cast_test.cc | 238 +++++++++--------- 2 files changed, 127 insertions(+), 194 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 367c6d88f81..aab60c10448 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -151,84 +151,6 @@ void AddListCast(CastFunction* func) { } struct CastStruct { - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const CastOptions& options = CastState::Get(ctx); - const StructType& in_type = checked_cast(*batch[0].type()); - const StructType& out_type = checked_cast(*out->type()); - const auto in_field_count = in_type.num_fields(); - - if (in_field_count != out_type.num_fields()) { - return Status::TypeError("struct field sizes do not match: ", in_type.ToString(), - " ", out_type.ToString()); - } - - for (int i = 0; i < in_field_count; ++i) { - const auto in_field = in_type.field(i); - const auto out_field = out_type.field(i); - if (in_field->name() != out_field->name()) { - return Status::TypeError("struct field names do not match: ", in_type.ToString(), - " ", out_type.ToString()); - } - - if (in_field->nullable() && !out_field->nullable()) { - return Status::TypeError("cannot cast nullable struct to non-nullable struct: ", - in_type.ToString(), " ", out_type.ToString()); - } - } - - if (out->kind() == Datum::SCALAR) { - const auto& in_scalar = checked_cast(*batch[0].scalar()); - auto out_scalar = checked_cast(out->scalar().get()); - - DCHECK(!out_scalar->is_valid); - if (in_scalar.is_valid) { - for (int i = 0; i < in_field_count; i++) { - auto values = in_scalar.value[i]; - auto target_type = out->type()->field(i)->type(); - ARROW_ASSIGN_OR_RAISE(Datum cast_values, - Cast(values, target_type, options, ctx->exec_context())); - DCHECK_EQ(Datum::SCALAR, cast_values.kind()); - out_scalar->value.push_back(cast_values.scalar()); - } - out_scalar->is_valid = true; - } - return Status::OK(); - } - - const ArrayData& in_array = *batch[0].array(); - ArrayData* out_array = out->mutable_array(); - - if (in_array.buffers[0]) { - ARROW_ASSIGN_OR_RAISE(out_array->buffers[0], - CopyBitmap(ctx->memory_pool(), in_array.buffers[0]->data(), - in_array.offset, in_array.length)); - } - - for (int i = 0; i < in_field_count; ++i) { - auto values = in_array.child_data[i]->Slice(in_array.offset, in_array.length); - auto target_type = out->type()->field(i)->type(); - - ARROW_ASSIGN_OR_RAISE(Datum cast_values, - Cast(values, target_type, options, ctx->exec_context())); - - DCHECK_EQ(Datum::ARRAY, cast_values.kind()); - out_array->child_data.push_back(cast_values.array()); - } - - return Status::OK(); - } -}; - -void AddStructToStructCast(CastFunction* func) { - ScalarKernel kernel; - kernel.exec = CastStruct::Exec; - kernel.signature = - KernelSignature::Make({InputType(StructType::type_id)}, kOutputTargetType); - kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; - DCHECK_OK(func->AddKernel(StructType::type_id, std::move(kernel))); -} - -struct CastStructSubset { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const CastOptions& options = CastState::Get(ctx); const auto& in_type = checked_cast(*batch[0].type()); @@ -305,9 +227,9 @@ struct CastStructSubset { } }; -void AddStructToStructSubsetCast(CastFunction* func) { +void AddStructToStructCast(CastFunction* func) { ScalarKernel kernel; - kernel.exec = CastStructSubset::Exec; + kernel.exec = CastStruct::Exec; kernel.signature = KernelSignature::Make({InputType(StructType::type_id)}, kOutputTargetType); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; @@ -338,7 +260,6 @@ std::vector> GetNestedCasts() { // So is struct auto cast_struct = std::make_shared("cast_struct", Type::STRUCT); AddCommonCasts(Type::STRUCT, kOutputTargetType, cast_struct.get()); - AddStructToStructSubsetCast(cast_struct.get()); AddStructToStructCast(cast_struct.get()); // So is dictionary diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index c15bc78cede..bf7782a3a9a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2245,10 +2245,132 @@ static void CheckStructToStruct( } } -TEST(Cast, StructToSameSizedAndNamedStruct) { - CheckStructToStruct({int32(), float32(), int64()}); +static void CheckStructToStructSubset( + const std::vector>& value_types) { + for (const auto& src_value_type : value_types) { + for (const auto& dest_value_type : value_types) { + std::vector field_names = {"a", "b", "c", "d", "e"}; + + std::shared_ptr a1, b1, c1, d1, e1; + a1 = ArrayFromJSON(src_value_type, "[1, 2, 5]"); + b1 = ArrayFromJSON(src_value_type, "[3, 4, 7]"); + c1 = ArrayFromJSON(src_value_type, "[9, 11, 44]"); + d1 = ArrayFromJSON(src_value_type, "[6, 51, 49]"); + e1 = ArrayFromJSON(src_value_type, "[19, 17, 74]"); + + std::shared_ptr a2, b2, c2, d2, e2; + a2 = ArrayFromJSON(dest_value_type, "[1, 2, 5]"); + b2 = ArrayFromJSON(dest_value_type, "[3, 4, 7]"); + c2 = ArrayFromJSON(dest_value_type, "[9, 11, 44]"); + d2 = ArrayFromJSON(dest_value_type, "[6, 51, 49]"); + e2 = ArrayFromJSON(dest_value_type, "[19, 17, 74]"); + + ASSERT_OK_AND_ASSIGN(auto src, + StructArray::Make({a1, b1, c1, d1, e1}, field_names)); + ASSERT_OK_AND_ASSIGN( + auto dest1, StructArray::Make({a2, c2}, std::vector{"a", "c"})); + CheckCast(src, dest1); + + ASSERT_OK_AND_ASSIGN( + auto dest2, StructArray::Make({b2, d2}, std::vector{"b", "d"})); + CheckCast(src, dest2); + + ASSERT_OK_AND_ASSIGN( + auto dest3, StructArray::Make({c2, e2}, std::vector{"c", "e"})); + CheckCast(src, dest3); + + ASSERT_OK_AND_ASSIGN( + auto dest4, + StructArray::Make({a2, d2, e2}, std::vector{"a", "d", "e"})); + CheckCast(src, dest4); + + ASSERT_OK_AND_ASSIGN( + auto dest5, + StructArray::Make({b2, c2, e2}, std::vector{"b", "c", "e"})); + CheckCast(src, dest5); + + ASSERT_OK_AND_ASSIGN( + auto dest6, StructArray::Make({a2, b2, c2, e2}, + std::vector{"a", "b", "c", "e"})); + CheckCast(src, dest6); + + ASSERT_OK_AND_ASSIGN( + auto dest7, StructArray::Make({a2, b2, c2, d2, e2}, {"a", "b", "c", "d", "e"})); + CheckCast(src, dest7); + + const auto dest8 = arrow::struct_({std::make_shared("a", int8()), + std::make_shared("d", int16()), + std::make_shared("f", int64())}); + const auto options = CastOptions::Safe(dest8); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + ::testing::HasSubstr( + "struct (sub)fields don't match or are in the wrong order"), + Cast(src, options)); + + // With nulls + std::shared_ptr null_bitmap; + BitmapFromVector({0, 1, 0}, &null_bitmap); + + ASSERT_OK_AND_ASSIGN(auto src_null, StructArray::Make({a1, b1, c1, d1, e1}, + field_names, null_bitmap)); + ASSERT_OK_AND_ASSIGN( + auto dest1_null, + StructArray::Make({a2, c2}, std::vector{"a", "c"}, null_bitmap)); + CheckCast(src_null, dest1_null); + + ASSERT_OK_AND_ASSIGN( + auto dest2_null, + StructArray::Make({b2, d2}, std::vector{"b", "d"}, null_bitmap)); + CheckCast(src_null, dest2_null); + + ASSERT_OK_AND_ASSIGN( + auto dest3_null, + StructArray::Make({c2, e2}, std::vector{"c", "e"}, null_bitmap)); + CheckCast(src_null, dest3_null); + + ASSERT_OK_AND_ASSIGN( + auto dest4_null, + StructArray::Make({a2, d2, e2}, std::vector{"a", "d", "e"}, + null_bitmap)); + CheckCast(src_null, dest4_null); + + ASSERT_OK_AND_ASSIGN( + auto dest5_null, + StructArray::Make({b2, c2, e2}, std::vector{"b", "c", "e"}, + null_bitmap)); + CheckCast(src_null, dest5_null); + + ASSERT_OK_AND_ASSIGN( + auto dest6_null, + StructArray::Make({a2, b2, c2, e2}, + std::vector{"a", "b", "c", "e"}, null_bitmap)); + CheckCast(src_null, dest6_null); + + ASSERT_OK_AND_ASSIGN( + auto dest7_null, + StructArray::Make({a2, b2, c2, d2, e2}, + std::vector{"a", "b", "c", "d", "e"}, + null_bitmap)); + CheckCast(src_null, dest7_null); + + const auto dest8_null = arrow::struct_({std::make_shared("a", int8()), + std::make_shared("d", int16()), + std::make_shared("f", int64())}); + const auto options_null = CastOptions::Safe(dest8_null); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + ::testing::HasSubstr( + "struct (sub)fields don't match or are in the wrong order"), + Cast(src_null, options_null)); + } + } } +TEST(Cast, StructToSameSizedAndNamedStruct) { CheckStructToStruct(NumericTypes()); } + +TEST(Cast, StructToStructSubset) { CheckStructToStructSubset(NumericTypes()); } + TEST(Cast, StructToSameSizedButDifferentNamedStruct) { std::vector field_names = {"a", "b"}; std::shared_ptr a, b; @@ -2266,7 +2388,7 @@ TEST(Cast, StructToSameSizedButDifferentNamedStruct) { Cast(src, options)); } -TEST(Cast, StructToDifferentSizeStruct) { +TEST(Cast, StructToLargerSizeStruct) { std::vector field_names = {"a", "b"}; std::shared_ptr a, b; a = ArrayFromJSON(int8(), "[1, 2]"); @@ -2327,116 +2449,6 @@ TEST(Cast, StructToSameSizedButDifferentNullabilityStruct) { Cast(src2, options)); } -TEST(Cast, StructSubset) { - std::vector field_names = {"a", "b", "c", "d", "e"}; - std::shared_ptr a, b, c, d, e; - a = ArrayFromJSON(int8(), "[1, 2, 5]"); - b = ArrayFromJSON(int8(), "[3, 4, 7]"); - c = ArrayFromJSON(int8(), "[9, 11, 44]"); - d = ArrayFromJSON(int8(), "[6, 51, 49]"); - e = ArrayFromJSON(int8(), "[19, 17, 74]"); - - ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a, b, c, d, e}, field_names)); - ASSERT_OK_AND_ASSIGN(auto dest1, - StructArray::Make({a, c}, std::vector{"a", "c"})); - CheckCast(src, dest1); - - ASSERT_OK_AND_ASSIGN(auto dest2, - StructArray::Make({b, d}, std::vector{"b", "d"})); - CheckCast(src, dest2); - - ASSERT_OK_AND_ASSIGN(auto dest3, - StructArray::Make({c, e}, std::vector{"c", "e"})); - CheckCast(src, dest3); - - ASSERT_OK_AND_ASSIGN( - auto dest4, StructArray::Make({a, d, e}, std::vector{"a", "d", "e"})); - CheckCast(src, dest4); - - ASSERT_OK_AND_ASSIGN( - auto dest5, StructArray::Make({b, c, e}, std::vector{"b", "c", "e"})); - CheckCast(src, dest5); - - ASSERT_OK_AND_ASSIGN( - auto dest6, - StructArray::Make({a, b, c, e}, std::vector{"a", "b", "c", "e"})); - CheckCast(src, dest6); - - ASSERT_OK_AND_ASSIGN(auto dest7, - StructArray::Make({a, b, c, d, e}, {"a", "b", "c", "d", "e"})); - CheckCast(src, dest7); - - const auto dest8 = arrow::struct_({std::make_shared("a", int8()), - std::make_shared("d", int16()), - std::make_shared("f", int64())}); - const auto options = CastOptions::Safe(dest8); - EXPECT_RAISES_WITH_MESSAGE_THAT( - TypeError, - ::testing::HasSubstr("struct (sub)fields don't match or are in the wrong order"), - Cast(src, options)); -} - -TEST(Cast, StructSubsetWithNulls) { - std::vector field_names = {"a", "b", "c", "d", "e"}; - std::shared_ptr a, b, c, d, e; - a = ArrayFromJSON(int8(), "[1, 2, 5]"); - b = ArrayFromJSON(int8(), "[3, 4, 7]"); - c = ArrayFromJSON(int8(), "[9, 11, 44]"); - d = ArrayFromJSON(int8(), "[6, 51, 49]"); - e = ArrayFromJSON(int8(), "[19, 17, 74]"); - - std::shared_ptr null_bitmap; - BitmapFromVector({0, 1, 0}, &null_bitmap); - - ASSERT_OK_AND_ASSIGN(auto src, - StructArray::Make({a, b, c, d, e}, field_names, null_bitmap)); - ASSERT_OK_AND_ASSIGN( - auto dest1, - StructArray::Make({a, c}, std::vector{"a", "c"}, null_bitmap)); - CheckCast(src, dest1); - - ASSERT_OK_AND_ASSIGN( - auto dest2, - StructArray::Make({b, d}, std::vector{"b", "d"}, null_bitmap)); - CheckCast(src, dest2); - - ASSERT_OK_AND_ASSIGN( - auto dest3, - StructArray::Make({c, e}, std::vector{"c", "e"}, null_bitmap)); - CheckCast(src, dest3); - - ASSERT_OK_AND_ASSIGN( - auto dest4, - StructArray::Make({a, d, e}, std::vector{"a", "d", "e"}, null_bitmap)); - CheckCast(src, dest4); - - ASSERT_OK_AND_ASSIGN( - auto dest5, - StructArray::Make({b, c, e}, std::vector{"b", "c", "e"}, null_bitmap)); - CheckCast(src, dest5); - - ASSERT_OK_AND_ASSIGN( - auto dest6, - StructArray::Make({a, b, c, e}, std::vector{"a", "b", "c", "e"}, - null_bitmap)); - CheckCast(src, dest6); - - ASSERT_OK_AND_ASSIGN( - auto dest7, - StructArray::Make({a, b, c, d, e}, - std::vector{"a", "b", "c", "d", "e"}, null_bitmap)); - CheckCast(src, dest7); - - const auto dest8 = arrow::struct_({std::make_shared("a", int8()), - std::make_shared("d", int16()), - std::make_shared("f", int64())}); - const auto options = CastOptions::Safe(dest8); - EXPECT_RAISES_WITH_MESSAGE_THAT( - TypeError, - ::testing::HasSubstr("struct (sub)fields don't match or are in the wrong order"), - Cast(src, options)); -} - TEST(Cast, IdentityCasts) { // ARROW-4102 auto CheckIdentityCast = [](std::shared_ptr type, const std::string& json) { From 23d72d26acb64fc2e4920a8be8191d4acbf62e84 Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Tue, 29 Mar 2022 13:14:56 +0530 Subject: [PATCH 6/8] Update tests --- .../compute/kernels/scalar_cast_nested.cc | 4 +- .../arrow/compute/kernels/scalar_cast_test.cc | 283 ++++++++++++------ cpp/src/arrow/dataset/scanner_test.cc | 2 +- 3 files changed, 196 insertions(+), 93 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index aab60c10448..7b4f56e6a6d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -168,7 +168,7 @@ struct CastStruct { const auto out_field = out_type.field(out_field_index); if (in_field->name() == out_field->name()) { if (in_field->nullable() && !out_field->nullable()) { - return Status::TypeError("cannot cast nullable struct to non-nullable struct: ", + return Status::TypeError("cannot cast nullable field to non-nullable field: ", in_type.ToString(), " ", out_type.ToString()); } fields_to_select[out_field_index++] = in_field_index; @@ -177,7 +177,7 @@ struct CastStruct { if (out_field_index < out_field_count) { return Status::TypeError( - "struct (sub)fields don't match or are in the wrong order: Input fields: ", + "struct fields don't match or are in the wrong order: Input fields: ", in_type.ToString(), " output fields: ", out_type.ToString()); } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index bf7782a3a9a..210517fbc86 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2248,7 +2248,10 @@ static void CheckStructToStruct( static void CheckStructToStructSubset( const std::vector>& value_types) { for (const auto& src_value_type : value_types) { + ARROW_SCOPED_TRACE("From type: ", src_value_type->ToString()); for (const auto& dest_value_type : value_types) { + ARROW_SCOPED_TRACE("To type: ", dest_value_type->ToString()); + std::vector field_names = {"a", "b", "c", "d", "e"}; std::shared_ptr a1, b1, c1, d1, e1; @@ -2267,48 +2270,84 @@ static void CheckStructToStructSubset( ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a1, b1, c1, d1, e1}, field_names)); - ASSERT_OK_AND_ASSIGN( - auto dest1, StructArray::Make({a2, c2}, std::vector{"a", "c"})); + ASSERT_OK_AND_ASSIGN(auto dest1, + StructArray::Make({a2}, std::vector{"a"})); CheckCast(src, dest1); ASSERT_OK_AND_ASSIGN( - auto dest2, StructArray::Make({b2, d2}, std::vector{"b", "d"})); + auto dest2, StructArray::Make({b2, c2}, std::vector{"b", "c"})); CheckCast(src, dest2); ASSERT_OK_AND_ASSIGN( - auto dest3, StructArray::Make({c2, e2}, std::vector{"c", "e"})); + auto dest3, + StructArray::Make({c2, d2, e2}, std::vector{"c", "d", "e"})); CheckCast(src, dest3); ASSERT_OK_AND_ASSIGN( - auto dest4, - StructArray::Make({a2, d2, e2}, std::vector{"a", "d", "e"})); + auto dest4, StructArray::Make({a2, b2, c2, e2}, + std::vector{"a", "b", "c", "e"})); CheckCast(src, dest4); ASSERT_OK_AND_ASSIGN( - auto dest5, - StructArray::Make({b2, c2, e2}, std::vector{"b", "c", "e"})); + auto dest5, StructArray::Make({a2, b2, c2, d2, e2}, {"a", "b", "c", "d", "e"})); CheckCast(src, dest5); - ASSERT_OK_AND_ASSIGN( - auto dest6, StructArray::Make({a2, b2, c2, e2}, - std::vector{"a", "b", "c", "e"})); - CheckCast(src, dest6); - - ASSERT_OK_AND_ASSIGN( - auto dest7, StructArray::Make({a2, b2, c2, d2, e2}, {"a", "b", "c", "d", "e"})); - CheckCast(src, dest7); - - const auto dest8 = arrow::struct_({std::make_shared("a", int8()), + // field does not exist + const auto dest6 = arrow::struct_({std::make_shared("a", int8()), std::make_shared("d", int16()), std::make_shared("f", int64())}); - const auto options = CastOptions::Safe(dest8); + const auto options6 = CastOptions::Safe(dest6); EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, - ::testing::HasSubstr( - "struct (sub)fields don't match or are in the wrong order"), - Cast(src, options)); + ::testing::HasSubstr("struct fields don't match or are in the wrong order"), + Cast(src, options6)); + + // fields in wrong order + const auto dest7 = arrow::struct_({std::make_shared("a", int8()), + std::make_shared("c", int16()), + std::make_shared("b", int64())}); + const auto options7 = CastOptions::Safe(dest7); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + ::testing::HasSubstr("struct fields don't match or are in the wrong order"), + Cast(src, options7)); + + // duplicate field names + const auto dest8 = arrow::struct_( + {std::make_shared("a", int8()), std::make_shared("c", int16()), + std::make_shared("d", int32()), std::make_shared("a", int64())}); + const auto options8 = CastOptions::Safe(dest8); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + ::testing::HasSubstr("struct fields don't match or are in the wrong order"), + Cast(src, options8)); + } + } +} + +static void CheckStructToStructSubsetWithNulls( + const std::vector>& value_types) { + for (const auto& src_value_type : value_types) { + ARROW_SCOPED_TRACE("From type: ", src_value_type->ToString()); + for (const auto& dest_value_type : value_types) { + ARROW_SCOPED_TRACE("To type: ", dest_value_type->ToString()); + + std::vector field_names = {"a", "b", "c", "d", "e"}; + + std::shared_ptr a1, b1, c1, d1, e1; + a1 = ArrayFromJSON(src_value_type, "[1, 2, 5]"); + b1 = ArrayFromJSON(src_value_type, "[3, 4, 7]"); + c1 = ArrayFromJSON(src_value_type, "[9, 11, 44]"); + d1 = ArrayFromJSON(src_value_type, "[6, 51, 49]"); + e1 = ArrayFromJSON(src_value_type, "[19, 17, 74]"); + + std::shared_ptr a2, b2, c2, d2, e2; + a2 = ArrayFromJSON(dest_value_type, "[1, 2, 5]"); + b2 = ArrayFromJSON(dest_value_type, "[3, 4, 7]"); + c2 = ArrayFromJSON(dest_value_type, "[9, 11, 44]"); + d2 = ArrayFromJSON(dest_value_type, "[6, 51, 49]"); + e2 = ArrayFromJSON(dest_value_type, "[19, 17, 74]"); - // With nulls std::shared_ptr null_bitmap; BitmapFromVector({0, 1, 0}, &null_bitmap); @@ -2316,53 +2355,62 @@ static void CheckStructToStructSubset( field_names, null_bitmap)); ASSERT_OK_AND_ASSIGN( auto dest1_null, - StructArray::Make({a2, c2}, std::vector{"a", "c"}, null_bitmap)); + StructArray::Make({a2}, std::vector{"a"}, null_bitmap)); CheckCast(src_null, dest1_null); ASSERT_OK_AND_ASSIGN( auto dest2_null, - StructArray::Make({b2, d2}, std::vector{"b", "d"}, null_bitmap)); + StructArray::Make({b2, c2}, std::vector{"b", "c"}, null_bitmap)); CheckCast(src_null, dest2_null); ASSERT_OK_AND_ASSIGN( auto dest3_null, - StructArray::Make({c2, e2}, std::vector{"c", "e"}, null_bitmap)); - CheckCast(src_null, dest3_null); - - ASSERT_OK_AND_ASSIGN( - auto dest4_null, StructArray::Make({a2, d2, e2}, std::vector{"a", "d", "e"}, null_bitmap)); - CheckCast(src_null, dest4_null); - - ASSERT_OK_AND_ASSIGN( - auto dest5_null, - StructArray::Make({b2, c2, e2}, std::vector{"b", "c", "e"}, - null_bitmap)); - CheckCast(src_null, dest5_null); + CheckCast(src_null, dest3_null); ASSERT_OK_AND_ASSIGN( - auto dest6_null, + auto dest4_null, StructArray::Make({a2, b2, c2, e2}, std::vector{"a", "b", "c", "e"}, null_bitmap)); - CheckCast(src_null, dest6_null); + CheckCast(src_null, dest4_null); ASSERT_OK_AND_ASSIGN( - auto dest7_null, + auto dest5_null, StructArray::Make({a2, b2, c2, d2, e2}, std::vector{"a", "b", "c", "d", "e"}, null_bitmap)); - CheckCast(src_null, dest7_null); + CheckCast(src_null, dest5_null); - const auto dest8_null = arrow::struct_({std::make_shared("a", int8()), + // field does not exist + const auto dest6_null = arrow::struct_({std::make_shared("a", int8()), std::make_shared("d", int16()), std::make_shared("f", int64())}); - const auto options_null = CastOptions::Safe(dest8_null); + const auto options6_null = CastOptions::Safe(dest6_null); EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, - ::testing::HasSubstr( - "struct (sub)fields don't match or are in the wrong order"), - Cast(src_null, options_null)); + ::testing::HasSubstr("struct fields don't match or are in the wrong order"), + Cast(src_null, options6_null)); + + // fields in wrong order + const auto dest7_null = arrow::struct_({std::make_shared("a", int8()), + std::make_shared("c", int16()), + std::make_shared("b", int64())}); + const auto options7_null = CastOptions::Safe(dest7_null); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + ::testing::HasSubstr("struct fields don't match or are in the wrong order"), + Cast(src_null, options7_null)); + + // duplicate field names + const auto dest8_null = arrow::struct_( + {std::make_shared("a", int8()), std::make_shared("c", int16()), + std::make_shared("d", int32()), std::make_shared("a", int64())}); + const auto options8_null = CastOptions::Safe(dest8_null); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + ::testing::HasSubstr("struct fields don't match or are in the wrong order"), + Cast(src_null, options8_null)); } } } @@ -2371,6 +2419,10 @@ TEST(Cast, StructToSameSizedAndNamedStruct) { CheckStructToStruct(NumericTypes() TEST(Cast, StructToStructSubset) { CheckStructToStructSubset(NumericTypes()); } +TEST(Cast, StructToStructSubsetWithNulls) { + CheckStructToStructSubsetWithNulls(NumericTypes()); +} + TEST(Cast, StructToSameSizedButDifferentNamedStruct) { std::vector field_names = {"a", "b"}; std::shared_ptr a, b; @@ -2384,11 +2436,11 @@ TEST(Cast, StructToSameSizedButDifferentNamedStruct) { EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, - ::testing::HasSubstr("struct (sub)fields don't match or are in the wrong order"), + ::testing::HasSubstr("struct fields don't match or are in the wrong order"), Cast(src, options)); } -TEST(Cast, StructToLargerSizeStruct) { +TEST(Cast, StructToBiggerStruct) { std::vector field_names = {"a", "b"}; std::shared_ptr a, b; a = ArrayFromJSON(int8(), "[1, 2]"); @@ -2402,51 +2454,102 @@ TEST(Cast, StructToLargerSizeStruct) { EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, - ::testing::HasSubstr("struct (sub)fields don't match or are in the wrong order"), + ::testing::HasSubstr("struct fields don't match or are in the wrong order"), Cast(src, options)); } -TEST(Cast, StructToSameSizedButDifferentNullabilityStruct) { - // OK to go from not-nullable to nullable... - std::vector> fields1 = { - std::make_shared("a", int8(), false), - std::make_shared("b", int8(), false)}; - std::shared_ptr a1, b1; - a1 = ArrayFromJSON(int8(), "[1, 2]"); - b1 = ArrayFromJSON(int8(), "[3, 4]"); - ASSERT_OK_AND_ASSIGN(auto src1, StructArray::Make({a1, b1}, fields1)); - - std::vector> fields2 = { - std::make_shared("a", int8(), true), - std::make_shared("b", int8(), true)}; - std::shared_ptr a2, b2; - a2 = ArrayFromJSON(int8(), "[1, 2]"); - b2 = ArrayFromJSON(int8(), "[3, 4]"); - ASSERT_OK_AND_ASSIGN(auto dest1, StructArray::Make({a2, b2}, fields2)); - - CheckCast(src1, dest1); - - // But not the other way around - std::vector> fields3 = { - std::make_shared("a", int8(), true), - std::make_shared("b", int8(), true)}; - std::shared_ptr a3, b3; - a3 = ArrayFromJSON(int8(), "[1, null]"); - b3 = ArrayFromJSON(int8(), "[3, 4]"); - ASSERT_OK_AND_ASSIGN(auto src2, StructArray::Make({a3, b3}, fields3)); - - std::vector> fields4 = { - std::make_shared("a", int8(), false), - std::make_shared("b", int8(), false)}; - const auto dest2 = arrow::struct_(fields4); - const auto options = CastOptions::Safe(dest2); - - EXPECT_RAISES_WITH_MESSAGE_THAT( - TypeError, - ::testing::HasSubstr( - "Type error: cannot cast nullable struct to non-nullable " - "struct: struct struct"), - Cast(src2, options)); +TEST(Cast, StructToDifferentNullabilityStruct) { + { + // OK to go from non-nullable to nullable... + ARROW_SCOPED_TRACE("From non-nullable to nullable"); + std::vector> fields_src_non_nullable = { + std::make_shared("a", int8(), false), + std::make_shared("b", int8(), false), + std::make_shared("c", int8(), false)}; + std::shared_ptr a_src_non_nullable, b_src_non_nullable, c_src_non_nullable; + a_src_non_nullable = ArrayFromJSON(int8(), "[11, 23, 56]"); + b_src_non_nullable = ArrayFromJSON(int8(), "[32, 46, 37]"); + c_src_non_nullable = ArrayFromJSON(int8(), "[95, 11, 44]"); + ASSERT_OK_AND_ASSIGN( + auto src_non_nullable, + StructArray::Make({a_src_non_nullable, b_src_non_nullable, c_src_non_nullable}, + fields_src_non_nullable)); + + std::shared_ptr a_dest_nullable, b_dest_nullable, c_dest_nullable; + a_dest_nullable = ArrayFromJSON(int64(), "[11, 23, 56]"); + b_dest_nullable = ArrayFromJSON(int64(), "[32, 46, 37]"); + c_dest_nullable = ArrayFromJSON(int64(), "[95, 11, 44]"); + + std::vector> fields_dest1_nullable = { + std::make_shared("a", int64(), true), + std::make_shared("b", int64(), true), + std::make_shared("c", int64(), true)}; + ASSERT_OK_AND_ASSIGN( + auto dest1_nullable, + StructArray::Make({a_dest_nullable, b_dest_nullable, c_dest_nullable}, + fields_dest1_nullable)); + CheckCast(src_non_nullable, dest1_nullable); + + std::vector> fields_dest2_nullable = { + std::make_shared("a", int64(), true), + std::make_shared("c", int64(), true)}; + ASSERT_OK_AND_ASSIGN( + auto dest2_nullable, + StructArray::Make({a_dest_nullable, c_dest_nullable}, fields_dest2_nullable)); + CheckCast(src_non_nullable, dest2_nullable); + + std::vector> fields_dest3_nullable = { + std::make_shared("b", int64(), true)}; + ASSERT_OK_AND_ASSIGN(auto dest3_nullable, + StructArray::Make({b_dest_nullable}, fields_dest3_nullable)); + CheckCast(src_non_nullable, dest3_nullable); + } + { + // But NOT OK to go from nullable to non-nullable... + ARROW_SCOPED_TRACE("From nullable to non-nullable"); + std::vector> fields_src_nullable = { + std::make_shared("a", int8(), true), + std::make_shared("b", int8(), true), + std::make_shared("c", int8(), true)}; + std::shared_ptr a_src_nullable, b_src_nullable, c_src_nullable; + a_src_nullable = ArrayFromJSON(int8(), "[1, null, 5]"); + b_src_nullable = ArrayFromJSON(int8(), "[3, 4, null]"); + c_src_nullable = ArrayFromJSON(int8(), "[9, 11, 44]"); + ASSERT_OK_AND_ASSIGN( + auto src_nullable, + StructArray::Make({a_src_nullable, b_src_nullable, c_src_nullable}, + fields_src_nullable)); + + std::vector> fields_dest1_non_nullable = { + std::make_shared("a", int64(), false), + std::make_shared("b", int64(), false), + std::make_shared("c", int64(), false)}; + const auto dest1_non_nullable = arrow::struct_(fields_dest1_non_nullable); + const auto options1_non_nullable = CastOptions::Safe(dest1_non_nullable); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + ::testing::HasSubstr("cannot cast nullable field to non-nullable field"), + Cast(src_nullable, options1_non_nullable)); + + std::vector> fields_dest2_non_nullble = { + std::make_shared("a", int64(), false), + std::make_shared("c", int64(), false)}; + const auto dest2_non_nullable = arrow::struct_(fields_dest2_non_nullble); + const auto options2_non_nullable = CastOptions::Safe(dest2_non_nullable); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + ::testing::HasSubstr("cannot cast nullable field to non-nullable field"), + Cast(src_nullable, options2_non_nullable)); + + std::vector> fields_dest3_non_nullble = { + std::make_shared("c", int64(), false)}; + const auto dest3_non_nullable = arrow::struct_(fields_dest3_non_nullble); + const auto options3_non_nullable = CastOptions::Safe(dest3_non_nullable); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + ::testing::HasSubstr("cannot cast nullable field to non-nullable field"), + Cast(src_nullable, options3_non_nullable)); + } } TEST(Cast, IdentityCasts) { diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index 7e0bbb40bd1..b211ce89947 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -1657,7 +1657,7 @@ TEST(ScanNode, MaterializationOfNestedVirtualColumn) { // TODO(ARROW-1888): allow scanner to "patch up" structs with casts EXPECT_FINISHES_AND_RAISES_WITH_MESSAGE_THAT( TypeError, - ::testing::HasSubstr("struct (sub)fields don't match or are in the wrong order"), + ::testing::HasSubstr("struct fields don't match or are in the wrong order"), plan.Run()); } From f8e01486fb1a1175f9b9cd661bbecc094d01127e Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Tue, 29 Mar 2022 20:32:24 +0530 Subject: [PATCH 7/8] Add test for duplicate field names --- .../compute/kernels/scalar_cast_nested.cc | 12 ++--- .../arrow/compute/kernels/scalar_cast_test.cc | 47 +++++++++++++++++-- 2 files changed, 49 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 7b4f56e6a6d..d91bf032e58 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -164,8 +164,8 @@ struct CastStruct { for (int in_field_index = 0; in_field_index < in_field_count && out_field_index < out_field_count; ++in_field_index) { - const auto in_field = in_type.field(in_field_index); - const auto out_field = out_type.field(out_field_index); + const auto& in_field = in_type.field(in_field_index); + const auto& out_field = out_type.field(out_field_index); if (in_field->name() == out_field->name()) { if (in_field->nullable() && !out_field->nullable()) { return Status::TypeError("cannot cast nullable field to non-nullable field: ", @@ -189,8 +189,8 @@ struct CastStruct { if (in_scalar.is_valid) { out_field_index = 0; for (int field_index : fields_to_select) { - auto values = in_scalar.value[field_index]; - auto target_type = out->type()->field(out_field_index++)->type(); + const auto& values = in_scalar.value[field_index]; + const auto& target_type = out->type()->field(out_field_index++)->type(); ARROW_ASSIGN_OR_RAISE(Datum cast_values, Cast(values, target_type, options, ctx->exec_context())); DCHECK_EQ(Datum::SCALAR, cast_values.kind()); @@ -212,9 +212,9 @@ struct CastStruct { out_field_index = 0; for (int field_index : fields_to_select) { - auto values = + const auto& values = in_array.child_data[field_index]->Slice(in_array.offset, in_array.length); - auto target_type = out->type()->field(out_field_index++)->type(); + const auto& target_type = out->type()->field(out_field_index++)->type(); ARROW_ASSIGN_OR_RAISE(Datum cast_values, Cast(values, target_type, options, ctx->exec_context())); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 210517fbc86..f4be3a887ee 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2312,7 +2312,7 @@ static void CheckStructToStructSubset( ::testing::HasSubstr("struct fields don't match or are in the wrong order"), Cast(src, options7)); - // duplicate field names + // duplicate missing field names const auto dest8 = arrow::struct_( {std::make_shared("a", int8()), std::make_shared("c", int16()), std::make_shared("d", int32()), std::make_shared("a", int64())}); @@ -2321,6 +2321,25 @@ static void CheckStructToStructSubset( TypeError, ::testing::HasSubstr("struct fields don't match or are in the wrong order"), Cast(src, options8)); + + // duplicate present field names + ASSERT_OK_AND_ASSIGN( + auto src_duplicate_field_names, + StructArray::Make({a1, b1, c1}, std::vector{"a", "a", "a"})); + + ASSERT_OK_AND_ASSIGN(auto dest1_duplicate_field_names, + StructArray::Make({a2}, std::vector{"a"})); + CheckCast(src_duplicate_field_names, dest1_duplicate_field_names); + + ASSERT_OK_AND_ASSIGN( + auto dest2_duplicate_field_names, + StructArray::Make({a2, b2}, std::vector{"a", "a"})); + CheckCast(src_duplicate_field_names, dest2_duplicate_field_names); + + ASSERT_OK_AND_ASSIGN( + auto dest3_duplicate_field_names, + StructArray::Make({a2, b2, c2}, std::vector{"a", "a", "a"})); + CheckCast(src_duplicate_field_names, dest3_duplicate_field_names); } } } @@ -2402,7 +2421,7 @@ static void CheckStructToStructSubsetWithNulls( ::testing::HasSubstr("struct fields don't match or are in the wrong order"), Cast(src_null, options7_null)); - // duplicate field names + // duplicate missing field names const auto dest8_null = arrow::struct_( {std::make_shared("a", int8()), std::make_shared("c", int16()), std::make_shared("d", int32()), std::make_shared("a", int64())}); @@ -2411,6 +2430,28 @@ static void CheckStructToStructSubsetWithNulls( TypeError, ::testing::HasSubstr("struct fields don't match or are in the wrong order"), Cast(src_null, options8_null)); + + // duplicate present field values + ASSERT_OK_AND_ASSIGN( + auto src_duplicate_field_names_null, + StructArray::Make({a1, b1, c1}, std::vector{"a", "a", "a"}, + null_bitmap)); + + ASSERT_OK_AND_ASSIGN( + auto dest1_duplicate_field_names_null, + StructArray::Make({a2}, std::vector{"a"}, null_bitmap)); + CheckCast(src_duplicate_field_names_null, dest1_duplicate_field_names_null); + + ASSERT_OK_AND_ASSIGN( + auto dest2_duplicate_field_names_null, + StructArray::Make({a2, b2}, std::vector{"a", "a"}, null_bitmap)); + CheckCast(src_duplicate_field_names_null, dest2_duplicate_field_names_null); + + ASSERT_OK_AND_ASSIGN( + auto dest3_duplicate_field_names_null, + StructArray::Make({a2, b2, c2}, std::vector{"a", "a", "a"}, + null_bitmap)); + CheckCast(src_duplicate_field_names_null, dest3_duplicate_field_names_null); } } } @@ -2461,7 +2502,6 @@ TEST(Cast, StructToBiggerStruct) { TEST(Cast, StructToDifferentNullabilityStruct) { { // OK to go from non-nullable to nullable... - ARROW_SCOPED_TRACE("From non-nullable to nullable"); std::vector> fields_src_non_nullable = { std::make_shared("a", int8(), false), std::make_shared("b", int8(), false), @@ -2506,7 +2546,6 @@ TEST(Cast, StructToDifferentNullabilityStruct) { } { // But NOT OK to go from nullable to non-nullable... - ARROW_SCOPED_TRACE("From nullable to non-nullable"); std::vector> fields_src_nullable = { std::make_shared("a", int8(), true), std::make_shared("b", int8(), true), From e02c0a6c01b25a089ed6a5fb58db1f589f83f94b Mon Sep 17 00:00:00 2001 From: Dhruv Vats Date: Tue, 29 Mar 2022 20:39:41 +0530 Subject: [PATCH 8/8] Add some nulls to nulls test --- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index f4be3a887ee..862e54d94bd 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2355,17 +2355,17 @@ static void CheckStructToStructSubsetWithNulls( std::shared_ptr a1, b1, c1, d1, e1; a1 = ArrayFromJSON(src_value_type, "[1, 2, 5]"); - b1 = ArrayFromJSON(src_value_type, "[3, 4, 7]"); + b1 = ArrayFromJSON(src_value_type, "[3, null, 7]"); c1 = ArrayFromJSON(src_value_type, "[9, 11, 44]"); - d1 = ArrayFromJSON(src_value_type, "[6, 51, 49]"); - e1 = ArrayFromJSON(src_value_type, "[19, 17, 74]"); + d1 = ArrayFromJSON(src_value_type, "[6, 51, null]"); + e1 = ArrayFromJSON(src_value_type, "[null, 17, 74]"); std::shared_ptr a2, b2, c2, d2, e2; a2 = ArrayFromJSON(dest_value_type, "[1, 2, 5]"); - b2 = ArrayFromJSON(dest_value_type, "[3, 4, 7]"); + b2 = ArrayFromJSON(dest_value_type, "[3, null, 7]"); c2 = ArrayFromJSON(dest_value_type, "[9, 11, 44]"); - d2 = ArrayFromJSON(dest_value_type, "[6, 51, 49]"); - e2 = ArrayFromJSON(dest_value_type, "[19, 17, 74]"); + d2 = ArrayFromJSON(dest_value_type, "[6, 51, null]"); + e2 = ArrayFromJSON(dest_value_type, "[null, 17, 74]"); std::shared_ptr null_bitmap; BitmapFromVector({0, 1, 0}, &null_bitmap);