From 3b995ce89d65d8469e2e6e02824648a0c3fbefd9 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 8 Jan 2022 15:59:27 -0500 Subject: [PATCH 01/51] Added failing test case --- .../arrow/compute/kernels/scalar_cast_test.cc | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 226452404d0..00febce6957 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2217,6 +2217,28 @@ TEST(Cast, ListToListOptionsPassthru) { } } +static void CheckStructToStruct(const std::vector>& value_types) { + for (const auto& src_value_type : value_types) { + for (const auto& dest_value_type : value_types) { + std::vector field_names = {"a", "b"}; + std::shared_ptr a1, b1, a2, b2; + a1 = ArrayFromJSON(src_value_type, "[1, 2]"); + b1 = ArrayFromJSON(src_value_type, "[3, 4]"); + a2 = ArrayFromJSON(dest_value_type, "[1, 2]"); + b2 = ArrayFromJSON(dest_value_type, "[3, 4]"); + + auto src = StructArray::Make({a1, b2}, field_names).ValueOrDie(); + auto dest = StructArray::Make({a2, b2}, field_names).ValueOrDie(); + + CheckCast(src, dest); + } + } +} + +TEST(Cast, StructToSameSizedAndNamedStruct) { + CheckStructToStruct({int32(), float32(), int64()}); +} + TEST(Cast, IdentityCasts) { // ARROW-4102 auto CheckIdentityCast = [](std::shared_ptr type, const std::string& json) { From 9ad70baf86d73d90f3f187734821bf2930764aac Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 8 Jan 2022 16:26:55 -0500 Subject: [PATCH 02/51] incomplete impl but some progress --- .../compute/kernels/scalar_cast_nested.cc | 91 ++++++++++++++++++- 1 file changed, 90 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index ab583bbbe8c..06dded417f5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -6,7 +6,7 @@ // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// htt://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an @@ -150,6 +150,94 @@ void AddListCast(CastFunction* func) { DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel))); } +template +struct CastStruct { + static constexpr bool is_upcast = sizeof(src_offset_type) < sizeof(dest_offset_type); + static constexpr bool is_downcast = sizeof(src_offset_type) > sizeof(dest_offset_type); + + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const CastOptions& options = CastState::Get(ctx); + + auto child_type = checked_cast(*out->type()).value_type(); + + if (out->kind() == Datum::SCALAR) { + // The scalar case is simple, as only the underlying values must be cast + const auto& in_scalar = checked_cast(*batch[0].scalar()); + auto out_scalar = checked_cast(out->scalar().get()); + + DCHECK(!out_scalar->is_valid); + if (in_scalar.is_valid) { + ARROW_ASSIGN_OR_RAISE(out_scalar->value, Cast(*in_scalar.value, child_type, + options, ctx->exec_context())); + + out_scalar->is_valid = true; + } + return Status::OK(); + } + + const ArrayData& in_array = *batch[0].array(); + auto offsets = in_array.GetValues(1); + Datum values = in_array.child_data[0]; + + ArrayData* out_array = out->mutable_array(); + out_array->buffers = in_array.buffers; + + // Shift bitmap in case the source offset is non-zero + if (in_array.offset != 0 && in_array.buffers[0]) { + ARROW_ASSIGN_OR_RAISE(out_array->buffers[0], + CopyBitmap(ctx->memory_pool(), in_array.buffers[0]->data(), + in_array.offset, in_array.length)); + } + + // Handle list offsets + // Several cases can arise: + // - the source offset is non-zero, in which case we slice the underlying values + // and shift the list offsets (regardless of their respective types) + // - the source offset is zero but source and destination types have + // different list offset types, in which case we cast the list offsets + // - otherwise, we simply keep the original list offsets + if (is_downcast) { + if (offsets[in_array.length] > std::numeric_limits::max()) { + return Status::Invalid("Array of type ", in_array.type->ToString(), + " too large to convert to ", out_array->type->ToString()); + } + } + + if (in_array.offset != 0) { + ARROW_ASSIGN_OR_RAISE( + out_array->buffers[1], + ctx->Allocate(sizeof(dest_offset_type) * (in_array.length + 1))); + + auto shifted_offsets = out_array->GetMutableValues(1); + for (int64_t i = 0; i < in_array.length + 1; ++i) { + shifted_offsets[i] = static_cast(offsets[i] - offsets[0]); + } + values = in_array.child_data[0]->Slice(offsets[0], offsets[in_array.length]); + } else { + RETURN_NOT_OK((CastListOffsets(ctx, in_array, out_array))); + } + + // Handle values + ARROW_ASSIGN_OR_RAISE(Datum cast_values, + Cast(values, child_type, options, ctx->exec_context())); + + DCHECK_EQ(Datum::ARRAY, cast_values.kind()); + out_array->child_data.push_back(cast_values.array()); + return Status::OK(); + } +}; + + + void AddStructToStructCast(CastFunction* func) { + ScalarKernel kernel; + kernel.exec = CastStruct::Exec; + kernel.signature = + // TODO: create a signature that checks element names / lengths? + KernelSignature::Make({InputType(SrcType::type_id)}, kOutputTargetType); + kernel.null_hanling = NullHandling::COMPUTED_NO_PREALLOCATE; + DCHECK_OK(func->AddKernel(StructType, std::move(kernel)); + } + } // namespace std::vector> GetNestedCasts() { @@ -174,6 +262,7 @@ std::vector> GetNestedCasts() { // So is struct auto cast_struct = std::make_shared("cast_struct", Type::STRUCT); AddCommonCasts(Type::STRUCT, kOutputTargetType, cast_struct.get()); + AddStructToStructCast(cast_struct); // So is dictionary auto cast_dictionary = From 8d125950495c3a046d4d2eb38a2d37045a4e1d03 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 9 Jan 2022 11:23:49 -0500 Subject: [PATCH 03/51] exceptions implemented --- .../compute/kernels/scalar_cast_nested.cc | 78 ++++--------------- 1 file changed, 17 insertions(+), 61 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 06dded417f5..fd47cd778f4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -150,79 +150,35 @@ void AddListCast(CastFunction* func) { DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel))); } -template struct CastStruct { - static constexpr bool is_upcast = sizeof(src_offset_type) < sizeof(dest_offset_type); - static constexpr bool is_downcast = sizeof(src_offset_type) > sizeof(dest_offset_type); - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const CastOptions& options = CastState::Get(ctx); - - auto child_type = checked_cast(*out->type()).value_type(); - + //const CastOptions& options = CastState::Get(ctx); if (out->kind() == Datum::SCALAR) { - // The scalar case is simple, as only the underlying values must be cast - const auto& in_scalar = checked_cast(*batch[0].scalar()); - auto out_scalar = checked_cast(out->scalar().get()); + const auto& in_scalar = checked_cast(*batch[0].scalar()); + auto out_scalar = checked_cast(out->scalar().get()); DCHECK(!out_scalar->is_valid); if (in_scalar.is_valid) { - ARROW_ASSIGN_OR_RAISE(out_scalar->value, Cast(*in_scalar.value, child_type, - options, ctx->exec_context())); - - out_scalar->is_valid = true; + auto in_type = in_scalar.type; } return Status::OK(); } - const ArrayData& in_array = *batch[0].array(); - auto offsets = in_array.GetValues(1); - Datum values = in_array.child_data[0]; - - ArrayData* out_array = out->mutable_array(); - out_array->buffers = in_array.buffers; - - // Shift bitmap in case the source offset is non-zero - if (in_array.offset != 0 && in_array.buffers[0]) { - ARROW_ASSIGN_OR_RAISE(out_array->buffers[0], - CopyBitmap(ctx->memory_pool(), in_array.buffers[0]->data(), - in_array.offset, in_array.length)); - } + const auto in_size = checked_cast(*batch[0].type()).num_fields(); + const auto out_size = checked_cast(*out->type()).num_fields(); - // Handle list offsets - // Several cases can arise: - // - the source offset is non-zero, in which case we slice the underlying values - // and shift the list offsets (regardless of their respective types) - // - the source offset is zero but source and destination types have - // different list offset types, in which case we cast the list offsets - // - otherwise, we simply keep the original list offsets - if (is_downcast) { - if (offsets[in_array.length] > std::numeric_limits::max()) { - return Status::Invalid("Array of type ", in_array.type->ToString(), - " too large to convert to ", out_array->type->ToString()); - } + if (in_size != out_size) { + ARROW_RETURN_NOT_OK(Status(StatusCode::TypeError, "struct sizes do not match")); } - if (in_array.offset != 0) { - ARROW_ASSIGN_OR_RAISE( - out_array->buffers[1], - ctx->Allocate(sizeof(dest_offset_type) * (in_array.length + 1))); - - auto shifted_offsets = out_array->GetMutableValues(1); - for (int64_t i = 0; i < in_array.length + 1; ++i) { - shifted_offsets[i] = static_cast(offsets[i] - offsets[0]); + for (auto i{0}; i < in_size; i++) { + const auto in_field_name = checked_cast(*batch[0].type()).field(i)->name(); + const auto out_field_name = checked_cast(*out->type()).field(i)->name(); + if (in_field_name != out_field_name) { + ARROW_RETURN_NOT_OK(Status(StatusCode::TypeError, "struct field names do not match")); } - values = in_array.child_data[0]->Slice(offsets[0], offsets[in_array.length]); - } else { - RETURN_NOT_OK((CastListOffsets(ctx, in_array, out_array))); } - // Handle values - ARROW_ASSIGN_OR_RAISE(Datum cast_values, - Cast(values, child_type, options, ctx->exec_context())); - - DCHECK_EQ(Datum::ARRAY, cast_values.kind()); - out_array->child_data.push_back(cast_values.array()); return Status::OK(); } }; @@ -233,9 +189,9 @@ struct CastStruct { kernel.exec = CastStruct::Exec; kernel.signature = // TODO: create a signature that checks element names / lengths? - KernelSignature::Make({InputType(SrcType::type_id)}, kOutputTargetType); - kernel.null_hanling = NullHandling::COMPUTED_NO_PREALLOCATE; - DCHECK_OK(func->AddKernel(StructType, std::move(kernel)); + KernelSignature::Make({InputType(StructType::type_id)}, kOutputTargetType); + kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; + DCHECK_OK(func->AddKernel(StructType::type_id, std::move(kernel))); } } // namespace @@ -262,7 +218,7 @@ std::vector> GetNestedCasts() { // So is struct auto cast_struct = std::make_shared("cast_struct", Type::STRUCT); AddCommonCasts(Type::STRUCT, kOutputTargetType, cast_struct.get()); - AddStructToStructCast(cast_struct); + AddStructToStructCast(cast_struct.get()); // So is dictionary auto cast_dictionary = From 88165c544d73bec23d7a9beac59217e8336d5212 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 9 Jan 2022 12:05:22 -0500 Subject: [PATCH 04/51] passed tests for exception messages --- .../compute/kernels/scalar_cast_nested.cc | 2 +- .../arrow/compute/kernels/scalar_cast_test.cc | 44 ++++++++++++++++++- 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index fd47cd778f4..92a234afccf 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -168,7 +168,7 @@ struct CastStruct { const auto out_size = checked_cast(*out->type()).num_fields(); if (in_size != out_size) { - ARROW_RETURN_NOT_OK(Status(StatusCode::TypeError, "struct sizes do not match")); + ARROW_RETURN_NOT_OK(Status(StatusCode::TypeError, "struct field sizes do not match")); } for (auto i{0}; i < in_size; i++) { diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 00febce6957..5e4f0b4f8cd 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2216,7 +2216,7 @@ TEST(Cast, ListToListOptionsPassthru) { } } } - + /* static void CheckStructToStruct(const std::vector>& value_types) { for (const auto& src_value_type : value_types) { for (const auto& dest_value_type : value_types) { @@ -2238,6 +2238,48 @@ static void CheckStructToStruct(const std::vector>& va TEST(Cast, StructToSameSizedAndNamedStruct) { CheckStructToStruct({int32(), float32(), int64()}); } + */ + + TEST(Cast, StructToSameSizedButDifferentNamedStruct) { + std::vector field_names = {"a", "b"}; + std::shared_ptr a, b; + a = ArrayFromJSON(int8(), "[1, 2]"); + b = ArrayFromJSON(int8(), "[3, 4]"); + auto src = StructArray::Make({a, b}, field_names).ValueOrDie(); + + // TODO: instead of creating an area create StructType directly + // error: call to implicitly-deleted copy constructor of 'arrow::StructType' + std::vector field_names2 = {"c", "d"}; + std::shared_ptr c, d; + c = ArrayFromJSON(int8(), "[1, 2]"); + d = ArrayFromJSON(int8(), "[3, 4]"); + auto dest = StructArray::Make({c, d}, field_names2).ValueOrDie(); + auto options = CastOptions{}; + options.to_type = dest->type(); + + ASSERT_RAISES_WITH_MESSAGE(TypeError, "Type error: struct field names do not match", Cast(src, options)); + } + + TEST(Cast, StructToDifferentSizeStruct) { + std::vector field_names = {"a", "b"}; + std::shared_ptr a, b; + a = ArrayFromJSON(int8(), "[1, 2]"); + b = ArrayFromJSON(int8(), "[3, 4]"); + auto src = StructArray::Make({a, b}, field_names).ValueOrDie(); + + // TODO: instead of creating an area create StructType directly + // error: call to implicitly-deleted copy constructor of 'arrow::StructType' + std::vector field_names2 = {"a", "b", "c"}; + std::shared_ptr a2, b2, c; + a2 = ArrayFromJSON(int8(), "[1, 2]"); + b2 = ArrayFromJSON(int8(), "[3, 4]"); + c = ArrayFromJSON(int8(), "[5, 6]"); + auto dest = StructArray::Make({a2, b2, c}, field_names2).ValueOrDie(); + auto options = CastOptions{}; + options.to_type = dest->type(); + + ASSERT_RAISES_WITH_MESSAGE(TypeError, "Type error: struct field sizes do not match", Cast(src, options)); + } TEST(Cast, IdentityCasts) { // ARROW-4102 From bcd43528599b6fde78aba4f55bc6a63dcc168691 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 9 Jan 2022 12:06:07 -0500 Subject: [PATCH 05/51] clang format fixup --- .../compute/kernels/scalar_cast_nested.cc | 33 ++--- .../arrow/compute/kernels/scalar_cast_test.cc | 119 +++++++++--------- 2 files changed, 77 insertions(+), 75 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 92a234afccf..58d27552277 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -152,14 +152,14 @@ void AddListCast(CastFunction* func) { struct CastStruct { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - //const CastOptions& options = CastState::Get(ctx); + // const CastOptions& options = CastState::Get(ctx); if (out->kind() == Datum::SCALAR) { const auto& in_scalar = checked_cast(*batch[0].scalar()); auto out_scalar = checked_cast(out->scalar().get()); DCHECK(!out_scalar->is_valid); if (in_scalar.is_valid) { - auto in_type = in_scalar.type; + auto in_type = in_scalar.type; } return Status::OK(); } @@ -168,31 +168,34 @@ struct CastStruct { const auto out_size = checked_cast(*out->type()).num_fields(); if (in_size != out_size) { - ARROW_RETURN_NOT_OK(Status(StatusCode::TypeError, "struct field sizes do not match")); + ARROW_RETURN_NOT_OK( + Status(StatusCode::TypeError, "struct field sizes do not match")); } for (auto i{0}; i < in_size; i++) { - const auto in_field_name = checked_cast(*batch[0].type()).field(i)->name(); - const auto out_field_name = checked_cast(*out->type()).field(i)->name(); + const auto in_field_name = + checked_cast(*batch[0].type()).field(i)->name(); + const auto out_field_name = + checked_cast(*out->type()).field(i)->name(); if (in_field_name != out_field_name) { - ARROW_RETURN_NOT_OK(Status(StatusCode::TypeError, "struct field names do not match")); + ARROW_RETURN_NOT_OK( + Status(StatusCode::TypeError, "struct field names do not match")); } } return Status::OK(); } }; - - void AddStructToStructCast(CastFunction* func) { - ScalarKernel kernel; - kernel.exec = CastStruct::Exec; - kernel.signature = - // TODO: create a signature that checks element names / lengths? +void AddStructToStructCast(CastFunction* func) { + ScalarKernel kernel; + kernel.exec = CastStruct::Exec; + kernel.signature = + // TODO: create a signature that checks element names / lengths? KernelSignature::Make({InputType(StructType::type_id)}, kOutputTargetType); - kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; - DCHECK_OK(func->AddKernel(StructType::type_id, std::move(kernel))); - } + kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; + DCHECK_OK(func->AddKernel(StructType::type_id, std::move(kernel))); +} } // namespace diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 5e4f0b4f8cd..7f158752b4d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2216,70 +2216,69 @@ TEST(Cast, ListToListOptionsPassthru) { } } } - /* -static void CheckStructToStruct(const std::vector>& value_types) { - for (const auto& src_value_type : value_types) { - for (const auto& dest_value_type : value_types) { - std::vector field_names = {"a", "b"}; - std::shared_ptr a1, b1, a2, b2; - a1 = ArrayFromJSON(src_value_type, "[1, 2]"); - b1 = ArrayFromJSON(src_value_type, "[3, 4]"); - a2 = ArrayFromJSON(dest_value_type, "[1, 2]"); - b2 = ArrayFromJSON(dest_value_type, "[3, 4]"); - - auto src = StructArray::Make({a1, b2}, field_names).ValueOrDie(); - auto dest = StructArray::Make({a2, b2}, field_names).ValueOrDie(); - - CheckCast(src, dest); - } +/* +static void CheckStructToStruct(const std::vector>& value_types) +{ for (const auto& src_value_type : value_types) { for (const auto& dest_value_type : +value_types) { std::vector field_names = {"a", "b"}; std::shared_ptr +a1, b1, a2, b2; a1 = ArrayFromJSON(src_value_type, "[1, 2]"); b1 = +ArrayFromJSON(src_value_type, "[3, 4]"); a2 = ArrayFromJSON(dest_value_type, "[1, 2]"); b2 += ArrayFromJSON(dest_value_type, "[3, 4]"); + + auto src = StructArray::Make({a1, b2}, field_names).ValueOrDie(); + auto dest = StructArray::Make({a2, b2}, field_names).ValueOrDie(); + + CheckCast(src, dest); } } +} TEST(Cast, StructToSameSizedAndNamedStruct) { - CheckStructToStruct({int32(), float32(), int64()}); -} - */ - - TEST(Cast, StructToSameSizedButDifferentNamedStruct) { - std::vector field_names = {"a", "b"}; - std::shared_ptr a, b; - a = ArrayFromJSON(int8(), "[1, 2]"); - b = ArrayFromJSON(int8(), "[3, 4]"); - auto src = StructArray::Make({a, b}, field_names).ValueOrDie(); - - // TODO: instead of creating an area create StructType directly - // error: call to implicitly-deleted copy constructor of 'arrow::StructType' - std::vector field_names2 = {"c", "d"}; - std::shared_ptr c, d; - c = ArrayFromJSON(int8(), "[1, 2]"); - d = ArrayFromJSON(int8(), "[3, 4]"); - auto dest = StructArray::Make({c, d}, field_names2).ValueOrDie(); - auto options = CastOptions{}; - options.to_type = dest->type(); - - ASSERT_RAISES_WITH_MESSAGE(TypeError, "Type error: struct field names do not match", Cast(src, options)); - } - - TEST(Cast, StructToDifferentSizeStruct) { - std::vector field_names = {"a", "b"}; - std::shared_ptr a, b; - a = ArrayFromJSON(int8(), "[1, 2]"); - b = ArrayFromJSON(int8(), "[3, 4]"); - auto src = StructArray::Make({a, b}, field_names).ValueOrDie(); - - // TODO: instead of creating an area create StructType directly - // error: call to implicitly-deleted copy constructor of 'arrow::StructType' - std::vector field_names2 = {"a", "b", "c"}; - std::shared_ptr a2, b2, c; - a2 = ArrayFromJSON(int8(), "[1, 2]"); - b2 = ArrayFromJSON(int8(), "[3, 4]"); - c = ArrayFromJSON(int8(), "[5, 6]"); - auto dest = StructArray::Make({a2, b2, c}, field_names2).ValueOrDie(); - auto options = CastOptions{}; - options.to_type = dest->type(); - - ASSERT_RAISES_WITH_MESSAGE(TypeError, "Type error: struct field sizes do not match", Cast(src, options)); - } +CheckStructToStruct({int32(), float32(), int64()}); +} +*/ + +TEST(Cast, StructToSameSizedButDifferentNamedStruct) { + std::vector field_names = {"a", "b"}; + std::shared_ptr a, b; + a = ArrayFromJSON(int8(), "[1, 2]"); + b = ArrayFromJSON(int8(), "[3, 4]"); + auto src = StructArray::Make({a, b}, field_names).ValueOrDie(); + + // TODO: instead of creating an area create StructType directly + // error: call to implicitly-deleted copy constructor of 'arrow::StructType' + std::vector field_names2 = {"c", "d"}; + std::shared_ptr c, d; + c = ArrayFromJSON(int8(), "[1, 2]"); + d = ArrayFromJSON(int8(), "[3, 4]"); + auto dest = StructArray::Make({c, d}, field_names2).ValueOrDie(); + auto options = CastOptions{}; + options.to_type = dest->type(); + + ASSERT_RAISES_WITH_MESSAGE(TypeError, "Type error: struct field names do not match", + Cast(src, options)); +} + +TEST(Cast, StructToDifferentSizeStruct) { + std::vector field_names = {"a", "b"}; + std::shared_ptr a, b; + a = ArrayFromJSON(int8(), "[1, 2]"); + b = ArrayFromJSON(int8(), "[3, 4]"); + auto src = StructArray::Make({a, b}, field_names).ValueOrDie(); + + // TODO: instead of creating an area create StructType directly + // error: call to implicitly-deleted copy constructor of 'arrow::StructType' + std::vector field_names2 = {"a", "b", "c"}; + std::shared_ptr a2, b2, c; + a2 = ArrayFromJSON(int8(), "[1, 2]"); + b2 = ArrayFromJSON(int8(), "[3, 4]"); + c = ArrayFromJSON(int8(), "[5, 6]"); + auto dest = StructArray::Make({a2, b2, c}, field_names2).ValueOrDie(); + auto options = CastOptions{}; + options.to_type = dest->type(); + + ASSERT_RAISES_WITH_MESSAGE(TypeError, "Type error: struct field sizes do not match", + Cast(src, options)); +} TEST(Cast, IdentityCasts) { // ARROW-4102 From ba7d6987a989a971312acfc4a48a7c405666b708 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 9 Jan 2022 12:52:21 -0500 Subject: [PATCH 06/51] uncomment test, still failing impl --- .../compute/kernels/scalar_cast_nested.cc | 28 +++++++++-------- .../arrow/compute/kernels/scalar_cast_test.cc | 31 ++++++++++--------- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 58d27552277..7c2ed49cfd4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -152,18 +152,7 @@ void AddListCast(CastFunction* func) { struct CastStruct { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - // const CastOptions& options = CastState::Get(ctx); - if (out->kind() == Datum::SCALAR) { - const auto& in_scalar = checked_cast(*batch[0].scalar()); - auto out_scalar = checked_cast(out->scalar().get()); - - DCHECK(!out_scalar->is_valid); - if (in_scalar.is_valid) { - auto in_type = in_scalar.type; - } - return Status::OK(); - } - + const CastOptions& options = CastState::Get(ctx); const auto in_size = checked_cast(*batch[0].type()).num_fields(); const auto out_size = checked_cast(*out->type()).num_fields(); @@ -183,6 +172,20 @@ struct CastStruct { } } + const ArrayData& in_array = *batch[0].array(); + ArrayData *out_array = out->mutable_array(); + out_array->buffers = in_array.buffers; + + for (int64_t i=0; i < in_array.length; i++) { + + } + auto out_type = out->type(); + auto exec_ctxt = ctx->exec_context(); + auto abcd = Cast(in_array, out_type, options, exec_ctxt); + ARROW_ASSIGN_OR_RAISE(Datum cast_values, abcd); + + DCHECK_EQ(Datum::ARRAY, cast_values.kind()); + return Status::OK(); } }; @@ -191,7 +194,6 @@ void AddStructToStructCast(CastFunction* func) { ScalarKernel kernel; kernel.exec = CastStruct::Exec; kernel.signature = - // TODO: create a signature that checks element names / lengths? KernelSignature::Make({InputType(StructType::type_id)}, kOutputTargetType); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; DCHECK_OK(func->AddKernel(StructType::type_id, std::move(kernel))); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 7f158752b4d..3d4a5ea09f6 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2216,26 +2216,29 @@ TEST(Cast, ListToListOptionsPassthru) { } } } -/* -static void CheckStructToStruct(const std::vector>& value_types) -{ for (const auto& src_value_type : value_types) { for (const auto& dest_value_type : -value_types) { std::vector field_names = {"a", "b"}; std::shared_ptr -a1, b1, a2, b2; a1 = ArrayFromJSON(src_value_type, "[1, 2]"); b1 = -ArrayFromJSON(src_value_type, "[3, 4]"); a2 = ArrayFromJSON(dest_value_type, "[1, 2]"); b2 -= ArrayFromJSON(dest_value_type, "[3, 4]"); - auto src = StructArray::Make({a1, b2}, field_names).ValueOrDie(); - auto dest = StructArray::Make({a2, b2}, field_names).ValueOrDie(); - - CheckCast(src, dest); +static void CheckStructToStruct( + const std::vector>& value_types) { + for (const auto& src_value_type : value_types) { + for (const auto& dest_value_type : value_types) { + std::vector field_names = {"a", "b"}; + std::shared_ptr a1, b1, a2, b2; + a1 = ArrayFromJSON(src_value_type, "[1, 2]"); + b1 = ArrayFromJSON(src_value_type, "[3, 4]"); + a2 = ArrayFromJSON(dest_value_type, "[1, 2]"); + b2 = ArrayFromJSON(dest_value_type, "[3, 4]"); + + auto src = StructArray::Make({a1, b2}, field_names).ValueOrDie(); + auto dest = StructArray::Make({a2, b2}, field_names).ValueOrDie(); + + CheckCast(src, dest); + } } } -} TEST(Cast, StructToSameSizedAndNamedStruct) { -CheckStructToStruct({int32(), float32(), int64()}); + CheckStructToStruct({int32(), float32(), int64()}); } -*/ TEST(Cast, StructToSameSizedButDifferentNamedStruct) { std::vector field_names = {"a", "b"}; From e817c35330c55289b525afe4e784bbb59f2629a5 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 9 Jan 2022 13:35:13 -0500 Subject: [PATCH 07/51] no more segfault --- .../compute/kernels/scalar_cast_nested.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 7c2ed49cfd4..21738f8f33e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -153,6 +153,7 @@ void AddListCast(CastFunction* func) { struct CastStruct { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const CastOptions& options = CastState::Get(ctx); + const auto in_size = checked_cast(*batch[0].type()).num_fields(); const auto out_size = checked_cast(*out->type()).num_fields(); @@ -175,16 +176,16 @@ struct CastStruct { const ArrayData& in_array = *batch[0].array(); ArrayData *out_array = out->mutable_array(); out_array->buffers = in_array.buffers; - - for (int64_t i=0; i < in_array.length; i++) { - + + for (unsigned long i=0; i < in_array.child_data.size(); i++) { + Datum values = in_array.child_data[i]; + + ARROW_ASSIGN_OR_RAISE(Datum cast_values, + Cast(values, out->type()->field(i)->type(), options, ctx->exec_context())); + + DCHECK_EQ(Datum::ARRAY, cast_values.kind()); + out_array->child_data.push_back(cast_values.array()); } - auto out_type = out->type(); - auto exec_ctxt = ctx->exec_context(); - auto abcd = Cast(in_array, out_type, options, exec_ctxt); - ARROW_ASSIGN_OR_RAISE(Datum cast_values, abcd); - - DCHECK_EQ(Datum::ARRAY, cast_values.kind()); return Status::OK(); } From 0e2d7a935a4af2a0c9f9399bcf1bf6ba5f520e78 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 24 Jan 2022 16:44:33 +0100 Subject: [PATCH 08/51] checkpoint for casting elements --- .../compute/kernels/scalar_cast_nested.cc | 28 +++++++++++++++++-- .../arrow/compute/kernels/scalar_cast_test.cc | 9 ++---- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 21738f8f33e..5e7127fc4f3 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -147,13 +147,15 @@ void AddListCast(CastFunction* func) { kernel.signature = KernelSignature::Make({InputType(SrcType::type_id)}, kOutputTargetType); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; - DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel))); + DCHECK_OK( func->AddKernel(SrcType::type_id, std::move(kernel))); } struct CastStruct { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const CastOptions& options = CastState::Get(ctx); + // Note that size refers to the number of struct elements, not the length of the + // arrays const auto in_size = checked_cast(*batch[0].type()).num_fields(); const auto out_size = checked_cast(*out->type()).num_fields(); @@ -173,6 +175,28 @@ struct CastStruct { } } + if (out->kind() == Datum::SCALAR) { + const auto& in_scalar = checked_cast(*batch[0].scalar()); + auto out_scalar = checked_cast(out->scalar().get()); + + std::vector descrs{}; + for (auto i{0}; i < in_size; i++) { + auto field = out->type()->field(i); + auto descr = ValueDescr(field->type()); + descrs.push_back(descr); + } + + auto in_values = static_cast>>(in_scalar.value); + if (in_scalar.is_valid) { + //ARROW_ASSIGN_OR_RAISE(out_scalar->value, + auto converted = Cast(in_values, descrs, + ctx->exec_context()).ValueOrDie(); + + out_scalar->is_valid = true; + } + return Status::OK(); + } + const ArrayData& in_array = *batch[0].array(); ArrayData *out_array = out->mutable_array(); out_array->buffers = in_array.buffers; @@ -195,7 +219,7 @@ void AddStructToStructCast(CastFunction* func) { ScalarKernel kernel; kernel.exec = CastStruct::Exec; kernel.signature = - KernelSignature::Make({InputType(StructType::type_id)}, kOutputTargetType); + KernelSignature::Make({InputType(StructType::type_id)}, kOutputTargetType); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; DCHECK_OK(func->AddKernel(StructType::type_id, std::move(kernel))); } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 3d4a5ea09f6..d38a6e76f29 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2221,15 +2221,12 @@ static void CheckStructToStruct( const std::vector>& value_types) { for (const auto& src_value_type : value_types) { for (const auto& dest_value_type : value_types) { - std::vector field_names = {"a", "b"}; + std::vector field_names = {"a"}; std::shared_ptr a1, b1, a2, b2; a1 = ArrayFromJSON(src_value_type, "[1, 2]"); - b1 = ArrayFromJSON(src_value_type, "[3, 4]"); a2 = ArrayFromJSON(dest_value_type, "[1, 2]"); - b2 = ArrayFromJSON(dest_value_type, "[3, 4]"); - - auto src = StructArray::Make({a1, b2}, field_names).ValueOrDie(); - auto dest = StructArray::Make({a2, b2}, field_names).ValueOrDie(); + auto src = StructArray::Make({a1}, field_names).ValueOrDie(); + auto dest = StructArray::Make({a2}, field_names).ValueOrDie(); CheckCast(src, dest); } From c3011d67005e8d8098736c462c50c785322507d5 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 24 Jan 2022 17:36:39 +0100 Subject: [PATCH 09/51] Datum cast hackery checkpoint --- .../arrow/compute/kernels/scalar_cast_nested.cc | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 5e7127fc4f3..399c6b64f9e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -182,15 +182,21 @@ struct CastStruct { std::vector descrs{}; for (auto i{0}; i < in_size; i++) { auto field = out->type()->field(i); - auto descr = ValueDescr(field->type()); + // TODO: don't hard code SCALAR in this call + auto descr = ValueDescr(field->type(), ValueDescr::SCALAR); descrs.push_back(descr); } - auto in_values = static_cast>>(in_scalar.value); + std::vector> in_values = in_scalar.value; + std::vector datums{}; + for (auto i{0}; i < in_size; i++) { + datums.push_back(Datum(in_values[i])); + } + if (in_scalar.is_valid) { //ARROW_ASSIGN_OR_RAISE(out_scalar->value, - auto converted = Cast(in_values, descrs, - ctx->exec_context()).ValueOrDie(); + auto converted = Cast(datums, descrs, ctx->exec_context()).ValueOrDie(); + out_scalar->is_valid = true; } From f099bd53c64e7a3c815e89aae55165a97d1c5626 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 24 Jan 2022 17:43:30 +0100 Subject: [PATCH 10/51] replaced unused code with NotImplemented --- .../arrow/compute/kernels/scalar_cast_nested.cc | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 399c6b64f9e..6d85602eaa1 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -203,21 +203,7 @@ struct CastStruct { return Status::OK(); } - const ArrayData& in_array = *batch[0].array(); - ArrayData *out_array = out->mutable_array(); - out_array->buffers = in_array.buffers; - - for (unsigned long i=0; i < in_array.child_data.size(); i++) { - Datum values = in_array.child_data[i]; - - ARROW_ASSIGN_OR_RAISE(Datum cast_values, - Cast(values, out->type()->field(i)->type(), options, ctx->exec_context())); - - DCHECK_EQ(Datum::ARRAY, cast_values.kind()); - out_array->child_data.push_back(cast_values.array()); - } - - return Status::OK(); + return Status::NotImplemented("Only Scalar Struct casts have been implemented"); } }; From 2d22e082b3ebc6634b4803da1b78855bb0fd7840 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 24 Jan 2022 17:57:30 +0100 Subject: [PATCH 11/51] working scalar impl --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 6d85602eaa1..a058932ba34 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -152,8 +152,6 @@ void AddListCast(CastFunction* func) { struct CastStruct { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const CastOptions& options = CastState::Get(ctx); - // Note that size refers to the number of struct elements, not the length of the // arrays const auto in_size = checked_cast(*batch[0].type()).num_fields(); @@ -187,6 +185,8 @@ struct CastStruct { descrs.push_back(descr); } + // TODO: one of the main issues right now is casting a const vector + // of pointers to Scalars to a vector of Datum and back std::vector> in_values = in_scalar.value; std::vector datums{}; for (auto i{0}; i < in_size; i++) { @@ -196,8 +196,13 @@ struct CastStruct { if (in_scalar.is_valid) { //ARROW_ASSIGN_OR_RAISE(out_scalar->value, auto converted = Cast(datums, descrs, ctx->exec_context()).ValueOrDie(); - + // TODO: same issue with vector of Datum -> Scalar conversion + std::vector> out_values; + for (auto i{0}; i < in_size; i++) { + out_values.push_back(converted[i].scalar()); + } + out_scalar->value = out_values; out_scalar->is_valid = true; } return Status::OK(); From 4a2fd05e8a71d9ad49e436be240de22b6b1d88c3 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 24 Jan 2022 18:10:44 +0100 Subject: [PATCH 12/51] code cleanup --- .../compute/kernels/scalar_cast_nested.cc | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index a058932ba34..5670b1368fe 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -152,17 +152,15 @@ void AddListCast(CastFunction* func) { struct CastStruct { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - // Note that size refers to the number of struct elements, not the length of the - // arrays - const auto in_size = checked_cast(*batch[0].type()).num_fields(); - const auto out_size = checked_cast(*out->type()).num_fields(); + const auto in_field_count = checked_cast(*batch[0].type()).num_fields(); + const auto out_field_count = checked_cast(*out->type()).num_fields(); - if (in_size != out_size) { + if (in_field_count != out_field_count) { ARROW_RETURN_NOT_OK( Status(StatusCode::TypeError, "struct field sizes do not match")); } - for (auto i{0}; i < in_size; i++) { + for (auto i{0}; i < in_field_count; i++) { const auto in_field_name = checked_cast(*batch[0].type()).field(i)->name(); const auto out_field_name = @@ -178,9 +176,10 @@ struct CastStruct { auto out_scalar = checked_cast(out->scalar().get()); std::vector descrs{}; - for (auto i{0}; i < in_size; i++) { + for (auto i{0}; i < in_field_count; i++) { auto field = out->type()->field(i); - // TODO: don't hard code SCALAR in this call + // TODO: don't hard code SCALAR in this call; OR alternately + // raise if we only support SCALAR elements auto descr = ValueDescr(field->type(), ValueDescr::SCALAR); descrs.push_back(descr); } @@ -189,22 +188,21 @@ struct CastStruct { // of pointers to Scalars to a vector of Datum and back std::vector> in_values = in_scalar.value; std::vector datums{}; - for (auto i{0}; i < in_size; i++) { + for (auto i{0}; i < in_field_count; i++) { datums.push_back(Datum(in_values[i])); } if (in_scalar.is_valid) { - //ARROW_ASSIGN_OR_RAISE(out_scalar->value, - auto converted = Cast(datums, descrs, ctx->exec_context()).ValueOrDie(); + auto casted = Cast(datums, descrs, ctx->exec_context()).ValueOrDie(); // TODO: same issue with vector of Datum -> Scalar conversion std::vector> out_values; - for (auto i{0}; i < in_size; i++) { - out_values.push_back(converted[i].scalar()); + for (auto i{0}; i < in_field_count; i++) { + out_values.push_back(casted[i].scalar()); } out_scalar->value = out_values; out_scalar->is_valid = true; - } + } return Status::OK(); } From 2bf3e22381d71379b604931e278ec3a79d9ebbe2 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 24 Jan 2022 18:47:40 +0100 Subject: [PATCH 13/51] passing tests --- .../compute/kernels/scalar_cast_nested.cc | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 5670b1368fe..3bd49e972b5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -17,6 +17,7 @@ // Implementation of casting to (or between) list types +#include #include #include #include @@ -152,6 +153,8 @@ void AddListCast(CastFunction* func) { struct CastStruct { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + // TODO: we aren't using options in the Scalar case + const CastOptions& options = CastState::Get(ctx); const auto in_field_count = checked_cast(*batch[0].type()).num_fields(); const auto out_field_count = checked_cast(*out->type()).num_fields(); @@ -206,7 +209,21 @@ struct CastStruct { return Status::OK(); } - return Status::NotImplemented("Only Scalar Struct casts have been implemented"); + // TODO: should we raise here for is_downcast similar to what the List does? + const ArrayData& in_array = *batch[0].array(); + ArrayData* out_array = out->mutable_array(); + + for (auto i{0}; i < in_field_count; i++) { + auto values = in_array.child_data[0]; + auto target_type = out->type()->field(i)->type(); + ARROW_ASSIGN_OR_RAISE(Datum cast_values, + Cast(values, target_type, options, ctx->exec_context())); + + DCHECK_EQ(Datum::ARRAY, cast_values.kind()); + out_array->child_data.push_back(cast_values.array()); + } + + return Status::OK(); } }; From 7df2f0179709b7d6dfd7005d0218c19f2a89a316 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 24 Jan 2022 18:49:20 +0100 Subject: [PATCH 14/51] format and cleanup --- .../compute/kernels/scalar_cast_nested.cc | 45 ++++++++++--------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 3bd49e972b5..b14ff7469a9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -17,7 +17,6 @@ // Implementation of casting to (or between) list types -#include #include #include #include @@ -148,15 +147,17 @@ void AddListCast(CastFunction* func) { kernel.signature = KernelSignature::Make({InputType(SrcType::type_id)}, kOutputTargetType); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; - DCHECK_OK( func->AddKernel(SrcType::type_id, std::move(kernel))); + DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel))); } struct CastStruct { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { // TODO: we aren't using options in the Scalar case const CastOptions& options = CastState::Get(ctx); - const auto in_field_count = checked_cast(*batch[0].type()).num_fields(); - const auto out_field_count = checked_cast(*out->type()).num_fields(); + const auto in_field_count = + checked_cast(*batch[0].type()).num_fields(); + const auto out_field_count = + checked_cast(*out->type()).num_fields(); if (in_field_count != out_field_count) { ARROW_RETURN_NOT_OK( @@ -180,11 +181,11 @@ struct CastStruct { std::vector descrs{}; for (auto i{0}; i < in_field_count; i++) { - auto field = out->type()->field(i); - // TODO: don't hard code SCALAR in this call; OR alternately - // raise if we only support SCALAR elements - auto descr = ValueDescr(field->type(), ValueDescr::SCALAR); - descrs.push_back(descr); + auto field = out->type()->field(i); + // TODO: don't hard code SCALAR in this call; OR alternately + // raise if we only support SCALAR elements + auto descr = ValueDescr(field->type(), ValueDescr::SCALAR); + descrs.push_back(descr); } // TODO: one of the main issues right now is casting a const vector @@ -192,18 +193,18 @@ struct CastStruct { std::vector> in_values = in_scalar.value; std::vector datums{}; for (auto i{0}; i < in_field_count; i++) { - datums.push_back(Datum(in_values[i])); + datums.push_back(Datum(in_values[i])); } if (in_scalar.is_valid) { - auto casted = Cast(datums, descrs, ctx->exec_context()).ValueOrDie(); - - // TODO: same issue with vector of Datum -> Scalar conversion - std::vector> out_values; - for (auto i{0}; i < in_field_count; i++) { - out_values.push_back(casted[i].scalar()); - } - out_scalar->value = out_values; + auto casted = Cast(datums, descrs, ctx->exec_context()).ValueOrDie(); + + // TODO: same issue with vector of Datum -> Scalar conversion + std::vector> out_values; + for (auto i{0}; i < in_field_count; i++) { + out_values.push_back(casted[i].scalar()); + } + out_scalar->value = out_values; out_scalar->is_valid = true; } return Status::OK(); @@ -217,12 +218,12 @@ struct CastStruct { auto values = in_array.child_data[0]; auto target_type = out->type()->field(i)->type(); ARROW_ASSIGN_OR_RAISE(Datum cast_values, - Cast(values, target_type, options, ctx->exec_context())); - + Cast(values, target_type, options, ctx->exec_context())); + DCHECK_EQ(Datum::ARRAY, cast_values.kind()); out_array->child_data.push_back(cast_values.array()); } - + return Status::OK(); } }; @@ -231,7 +232,7 @@ void AddStructToStructCast(CastFunction* func) { ScalarKernel kernel; kernel.exec = CastStruct::Exec; kernel.signature = - KernelSignature::Make({InputType(StructType::type_id)}, kOutputTargetType); + KernelSignature::Make({InputType(StructType::type_id)}, kOutputTargetType); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; DCHECK_OK(func->AddKernel(StructType::type_id, std::move(kernel))); } From bc21c880e2ec949719aedbe243f991f2af23903f Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 24 Jan 2022 18:50:00 +0100 Subject: [PATCH 15/51] revert inadvertant license typo change --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index b14ff7469a9..b09ab9e7498 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -6,7 +6,7 @@ // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // -// htt://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an From 60dedcdb81c6151e72fb4550d40a69989c3437bf Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 24 Jan 2022 22:10:38 +0100 Subject: [PATCH 16/51] simplified impl --- .../compute/kernels/scalar_cast_nested.cc | 38 +++++-------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index b09ab9e7498..07e7c75a8d5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -152,7 +152,6 @@ void AddListCast(CastFunction* func) { struct CastStruct { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - // TODO: we aren't using options in the Scalar case const CastOptions& options = CastState::Get(ctx); const auto in_field_count = checked_cast(*batch[0].type()).num_fields(); @@ -175,42 +174,23 @@ struct CastStruct { } } - if (out->kind() == Datum::SCALAR) { + if (out->kind() == Datum::SCALAR) { const auto& in_scalar = checked_cast(*batch[0].scalar()); auto out_scalar = checked_cast(out->scalar().get()); - std::vector descrs{}; - for (auto i{0}; i < in_field_count; i++) { - auto field = out->type()->field(i); - // TODO: don't hard code SCALAR in this call; OR alternately - // raise if we only support SCALAR elements - auto descr = ValueDescr(field->type(), ValueDescr::SCALAR); - descrs.push_back(descr); - } - - // TODO: one of the main issues right now is casting a const vector - // of pointers to Scalars to a vector of Datum and back - std::vector> in_values = in_scalar.value; - std::vector datums{}; for (auto i{0}; i < in_field_count; i++) { - datums.push_back(Datum(in_values[i])); - } - - if (in_scalar.is_valid) { - auto casted = Cast(datums, descrs, ctx->exec_context()).ValueOrDie(); - - // TODO: same issue with vector of Datum -> Scalar conversion - std::vector> out_values; - for (auto i{0}; i < in_field_count; i++) { - out_values.push_back(casted[i].scalar()); - } - out_scalar->value = out_values; - out_scalar->is_valid = true; + auto values = in_scalar.value[i]; + auto target_type = out->type()->field(i)->type(); + ARROW_ASSIGN_OR_RAISE(Datum cast_values, + Cast(values, target_type, options, ctx->exec_context())); + DCHECK_EQ(Datum::SCALAR, cast_values.kind()); + out_scalar->value.push_back(cast_values.scalar()); } + + out_scalar->is_valid = true; return Status::OK(); } - // TODO: should we raise here for is_downcast similar to what the List does? const ArrayData& in_array = *batch[0].array(); ArrayData* out_array = out->mutable_array(); From a9d6f53dd6b6bdd61d0abea54527cff948c28e95 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 24 Jan 2022 22:12:44 +0100 Subject: [PATCH 17/51] clang fixup --- .../arrow/compute/kernels/scalar_cast_nested.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 07e7c75a8d5..c5676989080 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -174,19 +174,19 @@ struct CastStruct { } } - if (out->kind() == Datum::SCALAR) { + if (out->kind() == Datum::SCALAR) { const auto& in_scalar = checked_cast(*batch[0].scalar()); auto out_scalar = checked_cast(out->scalar().get()); for (auto i{0}; i < in_field_count; i++) { - auto values = in_scalar.value[i]; - auto target_type = out->type()->field(i)->type(); - ARROW_ASSIGN_OR_RAISE(Datum cast_values, - Cast(values, target_type, options, ctx->exec_context())); - DCHECK_EQ(Datum::SCALAR, cast_values.kind()); - out_scalar->value.push_back(cast_values.scalar()); + auto values = in_scalar.value[i]; + auto target_type = out->type()->field(i)->type(); + ARROW_ASSIGN_OR_RAISE(Datum cast_values, + Cast(values, target_type, options, ctx->exec_context())); + DCHECK_EQ(Datum::SCALAR, cast_values.kind()); + out_scalar->value.push_back(cast_values.scalar()); } - + out_scalar->is_valid = true; return Status::OK(); } From daf9ecde6c1debb75453efd64309eaf873c12c7c Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 24 Jan 2022 22:13:37 +0100 Subject: [PATCH 18/51] comment cleanup --- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index d38a6e76f29..dff353e9e41 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2244,8 +2244,6 @@ TEST(Cast, StructToSameSizedButDifferentNamedStruct) { b = ArrayFromJSON(int8(), "[3, 4]"); auto src = StructArray::Make({a, b}, field_names).ValueOrDie(); - // TODO: instead of creating an area create StructType directly - // error: call to implicitly-deleted copy constructor of 'arrow::StructType' std::vector field_names2 = {"c", "d"}; std::shared_ptr c, d; c = ArrayFromJSON(int8(), "[1, 2]"); From 48e6ac2512cf6a125bd27215c27b2b20e337a08c Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 24 Jan 2022 22:16:22 +0100 Subject: [PATCH 19/51] remove auto from external loops --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index c5676989080..98772b4973a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -163,7 +163,7 @@ struct CastStruct { Status(StatusCode::TypeError, "struct field sizes do not match")); } - for (auto i{0}; i < in_field_count; i++) { + for (int64_t i = 0; i < in_field_count; ++i) { const auto in_field_name = checked_cast(*batch[0].type()).field(i)->name(); const auto out_field_name = @@ -194,7 +194,7 @@ struct CastStruct { const ArrayData& in_array = *batch[0].array(); ArrayData* out_array = out->mutable_array(); - for (auto i{0}; i < in_field_count; i++) { + for (int64_t i = 0; i < in_field_count; ++i) { auto values = in_array.child_data[0]; auto target_type = out->type()->field(i)->type(); ARROW_ASSIGN_OR_RAISE(Datum cast_values, From a5b6ca4e7d8c62c88de12d40d4f72769b3df2eeb Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 24 Jan 2022 20:10:20 -0800 Subject: [PATCH 20/51] TODO removal --- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index dff353e9e41..acfa275b046 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2263,8 +2263,6 @@ TEST(Cast, StructToDifferentSizeStruct) { b = ArrayFromJSON(int8(), "[3, 4]"); auto src = StructArray::Make({a, b}, field_names).ValueOrDie(); - // TODO: instead of creating an area create StructType directly - // error: call to implicitly-deleted copy constructor of 'arrow::StructType' std::vector field_names2 = {"a", "b", "c"}; std::shared_ptr a2, b2, c; a2 = ArrayFromJSON(int8(), "[1, 2]"); From 03a1a51bd92dc81412b908fe5c91b7a0e15b9080 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 24 Jan 2022 20:12:26 -0800 Subject: [PATCH 21/51] removed auto i loop initialization --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 98772b4973a..fb1e7777a2f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -178,7 +178,7 @@ struct CastStruct { const auto& in_scalar = checked_cast(*batch[0].scalar()); auto out_scalar = checked_cast(out->scalar().get()); - for (auto i{0}; i < in_field_count; i++) { + for (int64_t i = 0; i < in_field_count; i++) { auto values = in_scalar.value[i]; auto target_type = out->type()->field(i)->type(); ARROW_ASSIGN_OR_RAISE(Datum cast_values, From a8f843eb6e993156c4da23355edc23f9f5f34dfc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 1 Feb 2022 06:28:41 +0000 Subject: [PATCH 22/51] test with slice offset --- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index acfa275b046..56d0c8fb02c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2229,6 +2229,14 @@ static void CheckStructToStruct( auto dest = StructArray::Make({a2}, field_names).ValueOrDie(); CheckCast(src, dest); + + // Test corner case using children with offsets + auto a3 = ArrayFromJSON(src_value_type, "[1, 2, 3]")->Slice(1, 2); + auto a4 = ArrayFromJSON(dest_value_type, "[2, 3]"); + auto slicedSrc = StructArray::Make({a3}, field_names).ValueOrDie(); + auto slicedDest = StructArray::Make({a4}, field_names).ValueOrDie(); + + CheckCast(slicedSrc, slicedDest); } } } From 212fb2db39ce06315b79a1a5367b2741706b85da Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 1 Feb 2022 06:34:45 +0000 Subject: [PATCH 23/51] simplify return type --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index fb1e7777a2f..e4a1273569e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -159,8 +159,7 @@ struct CastStruct { checked_cast(*out->type()).num_fields(); if (in_field_count != out_field_count) { - ARROW_RETURN_NOT_OK( - Status(StatusCode::TypeError, "struct field sizes do not match")); + return Status(StatusCode::TypeError, "struct field sizes do not match"); } for (int64_t i = 0; i < in_field_count; ++i) { From f8b1b4ce7a602145d8b38eb3bc00e0e02e8d8ae0 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 1 Feb 2022 06:39:20 +0000 Subject: [PATCH 24/51] scalar valid checks --- .../compute/kernels/scalar_cast_nested.cc | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index e4a1273569e..caa9f892111 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -177,16 +177,18 @@ struct CastStruct { const auto& in_scalar = checked_cast(*batch[0].scalar()); auto out_scalar = checked_cast(out->scalar().get()); - for (int64_t i = 0; i < in_field_count; i++) { - auto values = in_scalar.value[i]; - auto target_type = out->type()->field(i)->type(); - ARROW_ASSIGN_OR_RAISE(Datum cast_values, - Cast(values, target_type, options, ctx->exec_context())); - DCHECK_EQ(Datum::SCALAR, cast_values.kind()); - out_scalar->value.push_back(cast_values.scalar()); + DCHECK(!out_scalar->is_valid); + if (in_scalar.is_valid) { + for (int64_t i = 0; i < in_field_count; i++) { + auto values = in_scalar.value[i]; + auto target_type = out->type()->field(i)->type(); + ARROW_ASSIGN_OR_RAISE(Datum cast_values, + Cast(values, target_type, options, ctx->exec_context())); + DCHECK_EQ(Datum::SCALAR, cast_values.kind()); + out_scalar->value.push_back(cast_values.scalar()); + } + out_scalar->is_valid = true; } - - out_scalar->is_valid = true; return Status::OK(); } From 23aa3bff7a32a78a931694d1826bf0c1f1e633c9 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 1 Feb 2022 07:12:08 +0000 Subject: [PATCH 25/51] clang-format --- .../compute/kernels/scalar_cast_nested.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index caa9f892111..99d71a9381c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -179,15 +179,15 @@ struct CastStruct { DCHECK(!out_scalar->is_valid); if (in_scalar.is_valid) { - for (int64_t i = 0; i < in_field_count; i++) { - auto values = in_scalar.value[i]; - auto target_type = out->type()->field(i)->type(); - ARROW_ASSIGN_OR_RAISE(Datum cast_values, - Cast(values, target_type, options, ctx->exec_context())); - DCHECK_EQ(Datum::SCALAR, cast_values.kind()); - out_scalar->value.push_back(cast_values.scalar()); - } - out_scalar->is_valid = true; + for (int64_t i = 0; i < in_field_count; i++) { + auto values = in_scalar.value[i]; + auto target_type = out->type()->field(i)->type(); + ARROW_ASSIGN_OR_RAISE(Datum cast_values, + Cast(values, target_type, options, ctx->exec_context())); + DCHECK_EQ(Datum::SCALAR, cast_values.kind()); + out_scalar->value.push_back(cast_values.scalar()); + } + out_scalar->is_valid = true; } return Status::OK(); } From 6d0b1970fa4530f0f8615dcde8c25f3a99a753ef Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 2 Feb 2022 18:16:09 -0800 Subject: [PATCH 26/51] initial feedback --- .../compute/kernels/scalar_cast_nested.cc | 5 ++-- .../arrow/compute/kernels/scalar_cast_test.cc | 27 ++++++++++--------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 99d71a9381c..65978431ed4 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -159,7 +159,7 @@ struct CastStruct { checked_cast(*out->type()).num_fields(); if (in_field_count != out_field_count) { - return Status(StatusCode::TypeError, "struct field sizes do not match"); + return Status::TypeError("struct field sizes do not match"); } for (int64_t i = 0; i < in_field_count; ++i) { @@ -168,8 +168,7 @@ struct CastStruct { const auto out_field_name = checked_cast(*out->type()).field(i)->name(); if (in_field_name != out_field_name) { - ARROW_RETURN_NOT_OK( - Status(StatusCode::TypeError, "struct field names do not match")); + return Status::TypeError("struct field names do not match"); } } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 56d0c8fb02c..13889e7a0b7 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2225,16 +2225,16 @@ static void CheckStructToStruct( std::shared_ptr a1, b1, a2, b2; a1 = ArrayFromJSON(src_value_type, "[1, 2]"); a2 = ArrayFromJSON(dest_value_type, "[1, 2]"); - auto src = StructArray::Make({a1}, field_names).ValueOrDie(); - auto dest = StructArray::Make({a2}, field_names).ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a1}, field_names)); + ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make({a2}, field_names)); CheckCast(src, dest); // Test corner case using children with offsets - auto a3 = ArrayFromJSON(src_value_type, "[1, 2, 3]")->Slice(1, 2); - auto a4 = ArrayFromJSON(dest_value_type, "[2, 3]"); - auto slicedSrc = StructArray::Make({a3}, field_names).ValueOrDie(); - auto slicedDest = StructArray::Make({a4}, field_names).ValueOrDie(); + auto a3 = ArrayFromJSON(src_value_type, "[1, 2, 3, 4, 5]")->Slice(1, 4); + auto a4 = ArrayFromJSON(dest_value_type, "[2, 3, 4, 5]"); + ASSERT_OK_AND_ASSIGN(auto slicedSrc, StructArray::Make({a3}, field_names)); + ASSERT_OK_AND_ASSIGN(auto slicedDest, StructArray::Make({a4}, field_names)); CheckCast(slicedSrc, slicedDest); } @@ -2250,17 +2250,18 @@ TEST(Cast, StructToSameSizedButDifferentNamedStruct) { std::shared_ptr a, b; a = ArrayFromJSON(int8(), "[1, 2]"); b = ArrayFromJSON(int8(), "[3, 4]"); - auto src = StructArray::Make({a, b}, field_names).ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a, b}, field_names)); std::vector field_names2 = {"c", "d"}; std::shared_ptr c, d; c = ArrayFromJSON(int8(), "[1, 2]"); d = ArrayFromJSON(int8(), "[3, 4]"); - auto dest = StructArray::Make({c, d}, field_names2).ValueOrDie(); - auto options = CastOptions{}; + ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make({c, d}, field_names2)); + auto options = CastOptions::Safe(dest->type()); options.to_type = dest->type(); - ASSERT_RAISES_WITH_MESSAGE(TypeError, "Type error: struct field names do not match", + EXPECT_RAISES_WITH_MESSAGE_THAT(TypeError, + ::testing::HasSubstr("Type error: struct field names do not match"), Cast(src, options)); } @@ -2269,18 +2270,18 @@ TEST(Cast, StructToDifferentSizeStruct) { std::shared_ptr a, b; a = ArrayFromJSON(int8(), "[1, 2]"); b = ArrayFromJSON(int8(), "[3, 4]"); - auto src = StructArray::Make({a, b}, field_names).ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a, b}, field_names)); std::vector field_names2 = {"a", "b", "c"}; std::shared_ptr a2, b2, c; a2 = ArrayFromJSON(int8(), "[1, 2]"); b2 = ArrayFromJSON(int8(), "[3, 4]"); c = ArrayFromJSON(int8(), "[5, 6]"); - auto dest = StructArray::Make({a2, b2, c}, field_names2).ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make({a2, b2, c}, field_names2)); auto options = CastOptions{}; options.to_type = dest->type(); - ASSERT_RAISES_WITH_MESSAGE(TypeError, "Type error: struct field sizes do not match", + EXPECT_RAISES_WITH_MESSAGE_THAT(TypeError, ::testing::HasSubstr("Type error: struct field sizes do not match"), Cast(src, options)); } From 3528df6c9bfce94ed561f29a0dac9244323f76aa Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 2 Feb 2022 19:12:44 -0800 Subject: [PATCH 27/51] better error messages --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 13 +++++++------ cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 65978431ed4..15f37e14720 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -159,16 +159,17 @@ struct CastStruct { checked_cast(*out->type()).num_fields(); if (in_field_count != out_field_count) { - return Status::TypeError("struct field sizes do not match"); + return Status::TypeError("struct field sizes do not match: ", in_field_count, " and ", out_field_count); } for (int64_t i = 0; i < in_field_count; ++i) { - const auto in_field_name = - checked_cast(*batch[0].type()).field(i)->name(); - const auto out_field_name = - checked_cast(*out->type()).field(i)->name(); + const auto in_field_name = checked_cast(*batch[0].type()).field(i)->name(); + const auto out_field_name = checked_cast(*out->type()).field(i)->name(); if (in_field_name != out_field_name) { - return Status::TypeError("struct field names do not match"); + return Status::TypeError("struct field names do not match: ", + batch[0].type()->ToString(), + " ", + out->type()->ToString()); } } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 13889e7a0b7..96cb368315e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2261,7 +2261,7 @@ TEST(Cast, StructToSameSizedButDifferentNamedStruct) { options.to_type = dest->type(); EXPECT_RAISES_WITH_MESSAGE_THAT(TypeError, - ::testing::HasSubstr("Type error: struct field names do not match"), + ::testing::HasSubstr("Type error: struct field names do not match: struct struct"), Cast(src, options)); } @@ -2281,7 +2281,7 @@ TEST(Cast, StructToDifferentSizeStruct) { auto options = CastOptions{}; options.to_type = dest->type(); - EXPECT_RAISES_WITH_MESSAGE_THAT(TypeError, ::testing::HasSubstr("Type error: struct field sizes do not match"), + EXPECT_RAISES_WITH_MESSAGE_THAT(TypeError, ::testing::HasSubstr("Type error: struct field sizes do not match: 2 and 3"), Cast(src, options)); } From 59718ecf74a2df993e4f0e7c7aab832bcf95c199 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 2 Feb 2022 19:13:33 -0800 Subject: [PATCH 28/51] clang-format --- .../arrow/compute/kernels/scalar_cast_nested.cc | 16 +++++++++------- .../arrow/compute/kernels/scalar_cast_test.cc | 14 +++++++++----- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 15f37e14720..7195aea5056 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -159,17 +159,19 @@ struct CastStruct { checked_cast(*out->type()).num_fields(); if (in_field_count != out_field_count) { - return Status::TypeError("struct field sizes do not match: ", in_field_count, " and ", out_field_count); + return Status::TypeError("struct field sizes do not match: ", in_field_count, + " and ", out_field_count); } for (int64_t i = 0; i < in_field_count; ++i) { - const auto in_field_name = checked_cast(*batch[0].type()).field(i)->name(); - const auto out_field_name = checked_cast(*out->type()).field(i)->name(); + const auto in_field_name = + checked_cast(*batch[0].type()).field(i)->name(); + const auto out_field_name = + checked_cast(*out->type()).field(i)->name(); if (in_field_name != out_field_name) { - return Status::TypeError("struct field names do not match: ", - batch[0].type()->ToString(), - " ", - out->type()->ToString()); + return Status::TypeError( + "struct field names do not match: ", batch[0].type()->ToString(), " ", + out->type()->ToString()); } } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 96cb368315e..75e1dd5444b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2260,9 +2260,11 @@ TEST(Cast, StructToSameSizedButDifferentNamedStruct) { auto options = CastOptions::Safe(dest->type()); options.to_type = dest->type(); - EXPECT_RAISES_WITH_MESSAGE_THAT(TypeError, - ::testing::HasSubstr("Type error: struct field names do not match: struct struct"), - Cast(src, options)); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + ::testing::HasSubstr("Type error: struct field names do not match: struct struct"), + Cast(src, options)); } TEST(Cast, StructToDifferentSizeStruct) { @@ -2281,8 +2283,10 @@ TEST(Cast, StructToDifferentSizeStruct) { auto options = CastOptions{}; options.to_type = dest->type(); - EXPECT_RAISES_WITH_MESSAGE_THAT(TypeError, ::testing::HasSubstr("Type error: struct field sizes do not match: 2 and 3"), - Cast(src, options)); + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + ::testing::HasSubstr("Type error: struct field sizes do not match: 2 and 3"), + Cast(src, options)); } TEST(Cast, IdentityCasts) { From c6856cae382cbaeec834930582c99212448e20b2 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 4 Feb 2022 19:20:13 -0600 Subject: [PATCH 29/51] updated scanner test --- cpp/src/arrow/dataset/scanner_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index e62c626d9f8..8948c2ea438 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -1626,8 +1626,8 @@ TEST(ScanNode, MaterializationOfNestedVirtualColumn) { // TODO(ARROW-1888): allow scanner to "patch up" structs with casts EXPECT_FINISHES_AND_RAISES_WITH_MESSAGE_THAT( - NotImplemented, - ::testing::HasSubstr("Unsupported cast from struct to struct"), + TypeError, + ::testing::HasSubstr("struct field sizes do not match"), plan.Run()); } From f73893e325c5b28df539f3fb8c6f052a49d27903 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 4 Feb 2022 19:22:15 -0600 Subject: [PATCH 30/51] make longer structs for CheckScalar test --- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 75e1dd5444b..ff8f34ed332 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2223,20 +2223,12 @@ static void CheckStructToStruct( for (const auto& dest_value_type : value_types) { std::vector field_names = {"a"}; std::shared_ptr a1, b1, a2, b2; - a1 = ArrayFromJSON(src_value_type, "[1, 2]"); - a2 = ArrayFromJSON(dest_value_type, "[1, 2]"); + a1 = ArrayFromJSON(src_value_type, "[1, 2, 3, 4, 5]"); + a2 = ArrayFromJSON(dest_value_type, "[1, 2, 3, 4, 5]"); ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a1}, field_names)); ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make({a2}, field_names)); CheckCast(src, dest); - - // Test corner case using children with offsets - auto a3 = ArrayFromJSON(src_value_type, "[1, 2, 3, 4, 5]")->Slice(1, 4); - auto a4 = ArrayFromJSON(dest_value_type, "[2, 3, 4, 5]"); - ASSERT_OK_AND_ASSIGN(auto slicedSrc, StructArray::Make({a3}, field_names)); - ASSERT_OK_AND_ASSIGN(auto slicedDest, StructArray::Make({a4}, field_names)); - - CheckCast(slicedSrc, slicedDest); } } } From 5be79f5f5cec5a171ea28bfcbf847c9dba8381aa Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 4 Feb 2022 22:00:14 -0600 Subject: [PATCH 31/51] added comments for research --- .../compute/kernels/scalar_cast_nested.cc | 43 ++++++++++++++++--- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 7195aea5056..47255150da8 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -195,17 +195,46 @@ struct CastStruct { } const ArrayData& in_array = *batch[0].array(); + // TODO: unlike List implementation, where their associated types + // have an offset type size (i.e. FixedSizeListType with int64_t offsets) + // StructType may not have the same thing + //auto offsets = in_array.GetValues(1); + Datum values = in_array.child_data[0]; + ArrayData* out_array = out->mutable_array(); + out_array->buffers = in_array.buffers; - for (int64_t i = 0; i < in_field_count; ++i) { - auto values = in_array.child_data[0]; - auto target_type = out->type()->field(i)->type(); - ARROW_ASSIGN_OR_RAISE(Datum cast_values, - Cast(values, target_type, options, ctx->exec_context())); + // Shift bitmap in case the source offset is non-zero + if (in_array.offset != 0 && in_array.buffers[0]) { + ARROW_ASSIGN_OR_RAISE(out_array->buffers[0], + CopyBitmap(ctx->memory_pool(), in_array.buffers[0]->data(), + in_array.offset, in_array.length)); + } + + // Handle struct offsets + if (in_array.offset != 0) { + ARROW_ASSIGN_OR_RAISE( + out_array->buffers[1], + ctx->Allocate(sizeof(StructType) * (in_array.length + 1))); - DCHECK_EQ(Datum::ARRAY, cast_values.kind()); - out_array->child_data.push_back(cast_values.array()); + /* + auto shifted_offsets = out_array->GetMutableValues(1); + for (int64_t i = 0; i < in_array.length + 1; ++i) { + shifted_offsets[i] = static_cast(offsets[i] - offsets[0]); + } + values = in_array.child_data[0]->Slice(offsets[0], offsets[in_array.length]); + */ } + + for (int64_t i = 0; i < in_field_count; ++i) { + auto values = in_array.child_data[0]; + auto target_type = out->type()->field(i)->type(); + ARROW_ASSIGN_OR_RAISE(Datum cast_values, + Cast(values, target_type, options, ctx->exec_context())); + + DCHECK_EQ(Datum::ARRAY, cast_values.kind()); + out_array->child_data.push_back(cast_values.array()); + } return Status::OK(); } From a2c134653b467f219f4a80a6652065c4b3d865b8 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 4 Feb 2022 22:20:18 -0600 Subject: [PATCH 32/51] compiling with segfault --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 13 ++++--------- cpp/src/arrow/type.h | 1 + 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 47255150da8..437fc988e62 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -195,10 +195,7 @@ struct CastStruct { } const ArrayData& in_array = *batch[0].array(); - // TODO: unlike List implementation, where their associated types - // have an offset type size (i.e. FixedSizeListType with int64_t offsets) - // StructType may not have the same thing - //auto offsets = in_array.GetValues(1); + auto offsets = in_array.GetValues(1); Datum values = in_array.child_data[0]; ArrayData* out_array = out->mutable_array(); @@ -215,15 +212,13 @@ struct CastStruct { if (in_array.offset != 0) { ARROW_ASSIGN_OR_RAISE( out_array->buffers[1], - ctx->Allocate(sizeof(StructType) * (in_array.length + 1))); + ctx->Allocate(sizeof(StructType::offset_type) * (in_array.length + 1))); - /* - auto shifted_offsets = out_array->GetMutableValues(1); + auto shifted_offsets = out_array->GetMutableValues(1); for (int64_t i = 0; i < in_array.length + 1; ++i) { - shifted_offsets[i] = static_cast(offsets[i] - offsets[0]); + shifted_offsets[i] = static_cast(offsets[i] - offsets[0]); } values = in_array.child_data[0]->Slice(offsets[0], offsets[in_array.length]); - */ } for (int64_t i = 0; i < in_field_count; ++i) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 4c439841ba2..8e0bbf5471b 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -972,6 +972,7 @@ class ARROW_EXPORT FixedSizeListType : public BaseListType { class ARROW_EXPORT StructType : public NestedType { public: static constexpr Type::type type_id = Type::STRUCT; + using offset_type = int32_t; static constexpr const char* type_name() { return "struct"; } From b5b96d5eaf1607112737ff450dcb93a9a44771b6 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 4 Feb 2022 22:22:45 -0600 Subject: [PATCH 33/51] added struct ToString for different sizes test --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 5 +++-- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 437fc988e62..25208b8a2c0 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -159,8 +159,9 @@ struct CastStruct { checked_cast(*out->type()).num_fields(); if (in_field_count != out_field_count) { - return Status::TypeError("struct field sizes do not match: ", in_field_count, - " and ", out_field_count); + return Status::TypeError( + "struct field sizes do not match: ", batch[0].type()->ToString(), " ", + " and ", out->type()->ToString()); } for (int64_t i = 0; i < in_field_count; ++i) { diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index ff8f34ed332..a66bc4e4633 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2277,7 +2277,8 @@ TEST(Cast, StructToDifferentSizeStruct) { EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, - ::testing::HasSubstr("Type error: struct field sizes do not match: 2 and 3"), + ::testing::HasSubstr("Type error: struct field sizes do not match: struct struct"), Cast(src, options)); } From ae87ae730011348cda02c0ca4d459325bab5c9ac Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 4 Feb 2022 22:38:27 -0600 Subject: [PATCH 34/51] better test; revert structtype change --- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 8 +++++--- cpp/src/arrow/type.h | 1 - 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index a66bc4e4633..9614f4886b2 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2221,12 +2221,14 @@ static void CheckStructToStruct( const std::vector>& value_types) { for (const auto& src_value_type : value_types) { for (const auto& dest_value_type : value_types) { - std::vector field_names = {"a"}; + std::vector field_names = {"a", "b"}; std::shared_ptr a1, b1, a2, b2; a1 = ArrayFromJSON(src_value_type, "[1, 2, 3, 4, 5]"); + b1 = ArrayFromJSON(src_value_type, "[6, 7, 8, 9, 0]"); a2 = ArrayFromJSON(dest_value_type, "[1, 2, 3, 4, 5]"); - ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a1}, field_names)); - ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make({a2}, field_names)); + b2 = ArrayFromJSON(dest_value_type, "[6, 7, 8, 9, 0]"); + ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a1, b1}, field_names)); + ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make({a2, b2}, field_names)); CheckCast(src, dest); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 8e0bbf5471b..4c439841ba2 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -972,7 +972,6 @@ class ARROW_EXPORT FixedSizeListType : public BaseListType { class ARROW_EXPORT StructType : public NestedType { public: static constexpr Type::type type_id = Type::STRUCT; - using offset_type = int32_t; static constexpr const char* type_name() { return "struct"; } From 1de6beae5c0b087a2790e04c0723ae1a24366107 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 4 Feb 2022 22:39:54 -0600 Subject: [PATCH 35/51] revert some things --- .../compute/kernels/scalar_cast_nested.cc | 38 ++++--------------- 1 file changed, 7 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 25208b8a2c0..46412635c80 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -196,41 +196,17 @@ struct CastStruct { } const ArrayData& in_array = *batch[0].array(); - auto offsets = in_array.GetValues(1); - Datum values = in_array.child_data[0]; - ArrayData* out_array = out->mutable_array(); - out_array->buffers = in_array.buffers; - - // Shift bitmap in case the source offset is non-zero - if (in_array.offset != 0 && in_array.buffers[0]) { - ARROW_ASSIGN_OR_RAISE(out_array->buffers[0], - CopyBitmap(ctx->memory_pool(), in_array.buffers[0]->data(), - in_array.offset, in_array.length)); - } - - // Handle struct offsets - if (in_array.offset != 0) { - ARROW_ASSIGN_OR_RAISE( - out_array->buffers[1], - ctx->Allocate(sizeof(StructType::offset_type) * (in_array.length + 1))); - - auto shifted_offsets = out_array->GetMutableValues(1); - for (int64_t i = 0; i < in_array.length + 1; ++i) { - shifted_offsets[i] = static_cast(offsets[i] - offsets[0]); - } - values = in_array.child_data[0]->Slice(offsets[0], offsets[in_array.length]); - } for (int64_t i = 0; i < in_field_count; ++i) { - auto values = in_array.child_data[0]; - auto target_type = out->type()->field(i)->type(); - ARROW_ASSIGN_OR_RAISE(Datum cast_values, - Cast(values, target_type, options, ctx->exec_context())); + auto values = in_array.child_data[0]; + auto target_type = out->type()->field(i)->type(); + ARROW_ASSIGN_OR_RAISE(Datum cast_values, + Cast(values, target_type, options, ctx->exec_context())); - DCHECK_EQ(Datum::ARRAY, cast_values.kind()); - out_array->child_data.push_back(cast_values.array()); - } + DCHECK_EQ(Datum::ARRAY, cast_values.kind()); + out_array->child_data.push_back(cast_values.array()); + } return Status::OK(); } From 4ff815e64ad0e857459733c0ca0575411476f8b8 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 4 Feb 2022 22:49:53 -0600 Subject: [PATCH 36/51] getting warmer --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 46412635c80..5c979a95a95 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -199,8 +199,9 @@ struct CastStruct { ArrayData* out_array = out->mutable_array(); for (int64_t i = 0; i < in_field_count; ++i) { - auto values = in_array.child_data[0]; + auto values = in_array.child_data[i]; auto target_type = out->type()->field(i)->type(); + ARROW_ASSIGN_OR_RAISE(Datum cast_values, Cast(values, target_type, options, ctx->exec_context())); From b051623029f88bfba8ecb125c233aaad2240b437 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 4 Feb 2022 22:53:19 -0600 Subject: [PATCH 37/51] clang-format --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 8 ++++---- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 2 +- cpp/src/arrow/dataset/scanner_test.cc | 4 +--- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 5c979a95a95..87e80bc3979 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -160,8 +160,8 @@ struct CastStruct { if (in_field_count != out_field_count) { return Status::TypeError( - "struct field sizes do not match: ", batch[0].type()->ToString(), " ", - " and ", out->type()->ToString()); + "struct field sizes do not match: ", batch[0].type()->ToString(), " ", " and ", + out->type()->ToString()); } for (int64_t i = 0; i < in_field_count; ++i) { @@ -197,13 +197,13 @@ struct CastStruct { const ArrayData& in_array = *batch[0].array(); ArrayData* out_array = out->mutable_array(); - + for (int64_t i = 0; i < in_field_count; ++i) { auto values = in_array.child_data[i]; auto target_type = out->type()->field(i)->type(); ARROW_ASSIGN_OR_RAISE(Datum cast_values, - Cast(values, target_type, options, ctx->exec_context())); + Cast(values, target_type, options, ctx->exec_context())); DCHECK_EQ(Datum::ARRAY, cast_values.kind()); out_array->child_data.push_back(cast_values.array()); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 9614f4886b2..5ed370ad494 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2280,7 +2280,7 @@ TEST(Cast, StructToDifferentSizeStruct) { EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, ::testing::HasSubstr("Type error: struct field sizes do not match: struct struct"), + "b: int8> struct"), Cast(src, options)); } diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index 8948c2ea438..c37b0666bbb 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -1626,9 +1626,7 @@ TEST(ScanNode, MaterializationOfNestedVirtualColumn) { // TODO(ARROW-1888): allow scanner to "patch up" structs with casts EXPECT_FINISHES_AND_RAISES_WITH_MESSAGE_THAT( - TypeError, - ::testing::HasSubstr("struct field sizes do not match"), - plan.Run()); + TypeError, ::testing::HasSubstr("struct field sizes do not match"), plan.Run()); } TEST(ScanNode, MinimalEndToEnd) { From 20fa5d9893836346fb1d7eab1e0988d7bb3dd4fd Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 6 Feb 2022 15:42:00 -0500 Subject: [PATCH 38/51] passing test --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 87e80bc3979..6227ed42d57 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -200,6 +200,9 @@ struct CastStruct { for (int64_t i = 0; i < in_field_count; ++i) { auto values = in_array.child_data[i]; + if (in_array.offset != 0) { + values = values->Slice(in_array.offset, in_array.length); + } auto target_type = out->type()->field(i)->type(); ARROW_ASSIGN_OR_RAISE(Datum cast_values, From 1ed998469ff7daa4fcfc88745b63a6cec7bb40e2 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 6 Feb 2022 15:47:11 -0500 Subject: [PATCH 39/51] all tests passing --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 6227ed42d57..775956b9ce0 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -160,7 +160,7 @@ struct CastStruct { if (in_field_count != out_field_count) { return Status::TypeError( - "struct field sizes do not match: ", batch[0].type()->ToString(), " ", " and ", + "struct field sizes do not match: ", batch[0].type()->ToString(), " ", out->type()->ToString()); } From 090c040bad67c2a38e73c467d3693a9712d18a18 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 6 Feb 2022 17:43:34 -0600 Subject: [PATCH 40/51] clang-format --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 775956b9ce0..632cce7ca31 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -159,9 +159,8 @@ struct CastStruct { checked_cast(*out->type()).num_fields(); if (in_field_count != out_field_count) { - return Status::TypeError( - "struct field sizes do not match: ", batch[0].type()->ToString(), " ", - out->type()->ToString()); + return Status::TypeError("struct field sizes do not match: ", + batch[0].type()->ToString(), " ", out->type()->ToString()); } for (int64_t i = 0; i < in_field_count; ++i) { @@ -201,7 +200,7 @@ struct CastStruct { for (int64_t i = 0; i < in_field_count; ++i) { auto values = in_array.child_data[i]; if (in_array.offset != 0) { - values = values->Slice(in_array.offset, in_array.length); + values = values->Slice(in_array.offset, in_array.length); } auto target_type = out->type()->field(i)->type(); From 9a505d61717345fb95cdb92fd8cc5dbe301b0b6e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 7 Feb 2022 17:35:02 -0800 Subject: [PATCH 41/51] new test --- .../arrow/compute/kernels/scalar_cast_test.cc | 39 +++++++++++++++++-- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 24937de14a0..2188736ff08 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2253,7 +2253,6 @@ TEST(Cast, StructToSameSizedButDifferentNamedStruct) { d = ArrayFromJSON(int8(), "[3, 4]"); ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make({c, d}, field_names2)); auto options = CastOptions::Safe(dest->type()); - options.to_type = dest->type(); EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, @@ -2275,8 +2274,7 @@ TEST(Cast, StructToDifferentSizeStruct) { b2 = ArrayFromJSON(int8(), "[3, 4]"); c = ArrayFromJSON(int8(), "[5, 6]"); ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make({a2, b2, c}, field_names2)); - auto options = CastOptions{}; - options.to_type = dest->type(); + auto options = CastOptions::Safe(dest->type()); EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, @@ -2285,6 +2283,41 @@ TEST(Cast, StructToDifferentSizeStruct) { Cast(src, options)); } +TEST(Cast, StructToSameSizedButDifferentNullabilityStruct) { + // OK to go from non-nullable to nullable... + std::vector field_names = {"a", "b"}; + std::shared_ptr a1, b1; + a1 = ArrayFromJSON(int8(), "[1, 2]"); + b1 = ArrayFromJSON(int8(), "[3, 4]"); + ASSERT_OK_AND_ASSIGN(auto src1, StructArray::Make({a1, b1}, field_names)); + + std::shared_ptr c1, d1; + c1 = ArrayFromJSON(int8(), "[1, null]"); + d1 = ArrayFromJSON(int8(), "[3, 4]"); + ASSERT_OK_AND_ASSIGN(auto dest1, StructArray::Make({c1, d1}, field_names)); + + CheckCast(src1, dest1); + + // But not the other way around + std::shared_ptr a2, b2; + a2 = ArrayFromJSON(int8(), "[1, null]"); + b2 = ArrayFromJSON(int8(), "[3, 4]"); + ASSERT_OK_AND_ASSIGN(auto src2, StructArray::Make({a2, b2}, field_names)); + + std::shared_ptr c2, d2; + c2 = ArrayFromJSON(int8(), "[1, 2]"); + d2 = ArrayFromJSON(int8(), "[3, 4]"); + ASSERT_OK_AND_ASSIGN(auto dest2, StructArray::Make({c2, d2}, field_names)); + auto options = CastOptions::Safe(dest2->type()); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + TypeError, + ::testing::HasSubstr("Type error: struct nullability does not match: struct struct"), + Cast(src2, options)); +} + + TEST(Cast, IdentityCasts) { // ARROW-4102 auto CheckIdentityCast = [](std::shared_ptr type, const std::string& json) { From 1e6b66059eeaa7be3ee05de9df140a20eeaf7616 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 7 Feb 2022 17:37:46 -0800 Subject: [PATCH 42/51] MSVC compat --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 632cce7ca31..f62e5f994d2 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -163,7 +163,7 @@ struct CastStruct { batch[0].type()->ToString(), " ", out->type()->ToString()); } - for (int64_t i = 0; i < in_field_count; ++i) { + for (int i = 0; i < in_field_count; ++i) { const auto in_field_name = checked_cast(*batch[0].type()).field(i)->name(); const auto out_field_name = @@ -181,7 +181,7 @@ struct CastStruct { DCHECK(!out_scalar->is_valid); if (in_scalar.is_valid) { - for (int64_t i = 0; i < in_field_count; i++) { + for (int i = 0; i < in_field_count; i++) { auto values = in_scalar.value[i]; auto target_type = out->type()->field(i)->type(); ARROW_ASSIGN_OR_RAISE(Datum cast_values, @@ -197,7 +197,7 @@ struct CastStruct { const ArrayData& in_array = *batch[0].array(); ArrayData* out_array = out->mutable_array(); - for (int64_t i = 0; i < in_field_count; ++i) { + for (int i = 0; i < in_field_count; ++i) { auto values = in_array.child_data[i]; if (in_array.offset != 0) { values = values->Slice(in_array.offset, in_array.length); From cc096b9b9b5d660feca2dd49224d4c2a1fa84e19 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 7 Feb 2022 18:29:03 -0800 Subject: [PATCH 43/51] semi passing tests --- .../compute/kernels/scalar_cast_nested.cc | 23 +++++++++ .../arrow/compute/kernels/scalar_cast_test.cc | 49 ++++++++++++------- 2 files changed, 55 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index f62e5f994d2..d170b1e0658 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -173,8 +173,31 @@ struct CastStruct { "struct field names do not match: ", batch[0].type()->ToString(), " ", out->type()->ToString()); } + + const auto in_field_nullable = + checked_cast(*batch[0].type()).field(i)->nullable(); + const auto out_field_nullable = + checked_cast(*out->type()).field(i)->nullable(); + + if (in_field_nullable && !out_field_nullable) { + return Status::TypeError( + "cannot cast non-nullable struct to nullable struct: ", batch[0].type()->ToString(), " ", + out->type()->ToString()); + } } + for (int i = 0; i < in_field_count; ++i) { + const auto in_field_name = + checked_cast(*batch[0].type()).field(i)->name(); + const auto out_field_name = + checked_cast(*out->type()).field(i)->name(); + if (in_field_name != out_field_name) { + return Status::TypeError( + "struct field names do not match: ", batch[0].type()->ToString(), " ", + out->type()->ToString()); + } + } + if (out->kind() == Datum::SCALAR) { const auto& in_scalar = checked_cast(*batch[0].scalar()); auto out_scalar = checked_cast(out->scalar().get()); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 2188736ff08..684b6ce1500 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2285,35 +2285,50 @@ TEST(Cast, StructToDifferentSizeStruct) { TEST(Cast, StructToSameSizedButDifferentNullabilityStruct) { // OK to go from non-nullable to nullable... - std::vector field_names = {"a", "b"}; + std::vector> fields1 = { + std::make_shared("a", int8(), false), + std::make_shared("b", int8(), false) + }; std::shared_ptr a1, b1; a1 = ArrayFromJSON(int8(), "[1, 2]"); b1 = ArrayFromJSON(int8(), "[3, 4]"); - ASSERT_OK_AND_ASSIGN(auto src1, StructArray::Make({a1, b1}, field_names)); - - std::shared_ptr c1, d1; - c1 = ArrayFromJSON(int8(), "[1, null]"); - d1 = ArrayFromJSON(int8(), "[3, 4]"); - ASSERT_OK_AND_ASSIGN(auto dest1, StructArray::Make({c1, d1}, field_names)); + ASSERT_OK_AND_ASSIGN(auto src1, StructArray::Make({a1, b1}, fields1)); - CheckCast(src1, dest1); - - // But not the other way around + std::vector> fields2 = { + std::make_shared("a", int8(), true), + std::make_shared("b", int8(), true) + }; std::shared_ptr a2, b2; a2 = ArrayFromJSON(int8(), "[1, null]"); b2 = ArrayFromJSON(int8(), "[3, 4]"); - ASSERT_OK_AND_ASSIGN(auto src2, StructArray::Make({a2, b2}, field_names)); + ASSERT_OK_AND_ASSIGN(auto dest1, StructArray::Make({a2, b2}, fields2)); - std::shared_ptr c2, d2; - c2 = ArrayFromJSON(int8(), "[1, 2]"); - d2 = ArrayFromJSON(int8(), "[3, 4]"); - ASSERT_OK_AND_ASSIGN(auto dest2, StructArray::Make({c2, d2}, field_names)); + CheckCast(src1, dest1); + + // But not the other way around + std::vector> fields3 = { + std::make_shared("a", int8(), true), + std::make_shared("b", int8(), true) + }; + std::shared_ptr a3, b3; + a3 = ArrayFromJSON(int8(), "[1, null]"); + b3 = ArrayFromJSON(int8(), "[3, 4]"); + ASSERT_OK_AND_ASSIGN(auto src2, StructArray::Make({a3, b3}, fields3)); + + std::vector> fields4 = { + std::make_shared("a", int8(), false), + std::make_shared("b", int8(), false) + }; + std::shared_ptr a4, b4; + a4 = ArrayFromJSON(int8(), "[1, 2]"); + a4 = ArrayFromJSON(int8(), "[3, 4]"); + ASSERT_OK_AND_ASSIGN(auto dest2, StructArray::Make({a4, b4}, fields4)); auto options = CastOptions::Safe(dest2->type()); EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, - ::testing::HasSubstr("Type error: struct nullability does not match: struct struct"), + ::testing::HasSubstr("Type error: cannot cast non-nullable struct to nullable struct: struct"), Cast(src2, options)); } From dc9f386d4de20ce9f43093c7ef9397d6388b655a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 7 Feb 2022 19:06:15 -0800 Subject: [PATCH 44/51] clang-format --- .../compute/kernels/scalar_cast_nested.cc | 10 +++---- .../arrow/compute/kernels/scalar_cast_test.cc | 30 ++++++++----------- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index d170b1e0658..ec3084dd99e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -175,14 +175,14 @@ struct CastStruct { } const auto in_field_nullable = - checked_cast(*batch[0].type()).field(i)->nullable(); + checked_cast(*batch[0].type()).field(i)->nullable(); const auto out_field_nullable = checked_cast(*out->type()).field(i)->nullable(); if (in_field_nullable && !out_field_nullable) { - return Status::TypeError( - "cannot cast non-nullable struct to nullable struct: ", batch[0].type()->ToString(), " ", - out->type()->ToString()); + return Status::TypeError("cannot cast non-nullable struct to nullable struct: ", + batch[0].type()->ToString(), " ", + out->type()->ToString()); } } @@ -196,7 +196,7 @@ struct CastStruct { "struct field names do not match: ", batch[0].type()->ToString(), " ", out->type()->ToString()); } - } + } if (out->kind() == Datum::SCALAR) { const auto& in_scalar = checked_cast(*batch[0].scalar()); diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 684b6ce1500..b39f3c15cdd 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2286,18 +2286,16 @@ TEST(Cast, StructToDifferentSizeStruct) { TEST(Cast, StructToSameSizedButDifferentNullabilityStruct) { // OK to go from non-nullable to nullable... std::vector> fields1 = { - std::make_shared("a", int8(), false), - std::make_shared("b", int8(), false) - }; + std::make_shared("a", int8(), false), + std::make_shared("b", int8(), false)}; std::shared_ptr a1, b1; a1 = ArrayFromJSON(int8(), "[1, 2]"); b1 = ArrayFromJSON(int8(), "[3, 4]"); ASSERT_OK_AND_ASSIGN(auto src1, StructArray::Make({a1, b1}, fields1)); std::vector> fields2 = { - std::make_shared("a", int8(), true), - std::make_shared("b", int8(), true) - }; + std::make_shared("a", int8(), true), + std::make_shared("b", int8(), true)}; std::shared_ptr a2, b2; a2 = ArrayFromJSON(int8(), "[1, null]"); b2 = ArrayFromJSON(int8(), "[3, 4]"); @@ -2307,31 +2305,29 @@ TEST(Cast, StructToSameSizedButDifferentNullabilityStruct) { // But not the other way around std::vector> fields3 = { - std::make_shared("a", int8(), true), - std::make_shared("b", int8(), true) - }; + std::make_shared("a", int8(), true), + std::make_shared("b", int8(), true)}; std::shared_ptr a3, b3; a3 = ArrayFromJSON(int8(), "[1, null]"); b3 = ArrayFromJSON(int8(), "[3, 4]"); ASSERT_OK_AND_ASSIGN(auto src2, StructArray::Make({a3, b3}, fields3)); std::vector> fields4 = { - std::make_shared("a", int8(), false), - std::make_shared("b", int8(), false) - }; + std::make_shared("a", int8(), false), + std::make_shared("b", int8(), false)}; std::shared_ptr a4, b4; a4 = ArrayFromJSON(int8(), "[1, 2]"); a4 = ArrayFromJSON(int8(), "[3, 4]"); ASSERT_OK_AND_ASSIGN(auto dest2, StructArray::Make({a4, b4}, fields4)); auto options = CastOptions::Safe(dest2->type()); - + EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, - ::testing::HasSubstr("Type error: cannot cast non-nullable struct to nullable struct: struct"), - Cast(src2, options)); -} - + Cast(src2, options)); +} TEST(Cast, IdentityCasts) { // ARROW-4102 From c7ee877c49a5bd05ee2fb784bbdb9da77dca34b4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 8 Feb 2022 18:47:31 -0800 Subject: [PATCH 45/51] passing nullability test --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 3 ++- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index ec3084dd99e..01f59fc8a3a 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -180,7 +180,7 @@ struct CastStruct { checked_cast(*out->type()).field(i)->nullable(); if (in_field_nullable && !out_field_nullable) { - return Status::TypeError("cannot cast non-nullable struct to nullable struct: ", + return Status::TypeError("cannot cast nullable struct to non-nullable struct: ", batch[0].type()->ToString(), " ", out->type()->ToString()); } @@ -219,6 +219,7 @@ struct CastStruct { const ArrayData& in_array = *batch[0].array(); ArrayData* out_array = out->mutable_array(); + out_array->buffers = in_array.buffers; for (int i = 0; i < in_field_count; ++i) { auto values = in_array.child_data[i]; diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index b39f3c15cdd..3ee02e451d6 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2284,7 +2284,7 @@ TEST(Cast, StructToDifferentSizeStruct) { } TEST(Cast, StructToSameSizedButDifferentNullabilityStruct) { - // OK to go from non-nullable to nullable... + // OK to go from not-nullable to nullable... std::vector> fields1 = { std::make_shared("a", int8(), false), std::make_shared("b", int8(), false)}; @@ -2297,7 +2297,7 @@ TEST(Cast, StructToSameSizedButDifferentNullabilityStruct) { std::make_shared("a", int8(), true), std::make_shared("b", int8(), true)}; std::shared_ptr a2, b2; - a2 = ArrayFromJSON(int8(), "[1, null]"); + a2 = ArrayFromJSON(int8(), "[1, 2]"); b2 = ArrayFromJSON(int8(), "[3, 4]"); ASSERT_OK_AND_ASSIGN(auto dest1, StructArray::Make({a2, b2}, fields2)); @@ -2317,15 +2317,15 @@ TEST(Cast, StructToSameSizedButDifferentNullabilityStruct) { std::make_shared("b", int8(), false)}; std::shared_ptr a4, b4; a4 = ArrayFromJSON(int8(), "[1, 2]"); - a4 = ArrayFromJSON(int8(), "[3, 4]"); + b4 = ArrayFromJSON(int8(), "[3, 4]"); ASSERT_OK_AND_ASSIGN(auto dest2, StructArray::Make({a4, b4}, fields4)); auto options = CastOptions::Safe(dest2->type()); EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, - ::testing::HasSubstr("Type error: cannot cast non-nullable struct to nullable " - "struct: struct"), + ::testing::HasSubstr( + "Type error: cannot cast nullable struct to non-nullable " + "struct: struct struct"), Cast(src2, options)); } From c7150aaaeeaa99cebb5e3919e08e58fecf0bf341 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 9 Feb 2022 17:06:11 -0800 Subject: [PATCH 46/51] variable cleanup --- .../compute/kernels/scalar_cast_nested.cc | 45 ++++++------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 01f59fc8a3a..2b4b76a830e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -153,48 +153,29 @@ void AddListCast(CastFunction* func) { struct CastStruct { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const CastOptions& options = CastState::Get(ctx); + const StructType& in_type = checked_cast(*batch[0].type()); + const StructType& out_type = checked_cast(*out->type()); const auto in_field_count = - checked_cast(*batch[0].type()).num_fields(); - const auto out_field_count = - checked_cast(*out->type()).num_fields(); + in_type.num_fields(); - if (in_field_count != out_field_count) { + if (in_field_count != out_type.num_fields()) { return Status::TypeError("struct field sizes do not match: ", - batch[0].type()->ToString(), " ", out->type()->ToString()); + in_type.ToString(), " ", out_type.ToString()); } for (int i = 0; i < in_field_count; ++i) { - const auto in_field_name = - checked_cast(*batch[0].type()).field(i)->name(); - const auto out_field_name = - checked_cast(*out->type()).field(i)->name(); - if (in_field_name != out_field_name) { + const auto in_field = in_type.field(i); + const auto out_field = out_type.field(i); + if (in_field->name() != out_field->name()) { return Status::TypeError( - "struct field names do not match: ", batch[0].type()->ToString(), " ", - out->type()->ToString()); + "struct field names do not match: ", in_type.ToString(), " ", + out_type.ToString()); } - const auto in_field_nullable = - checked_cast(*batch[0].type()).field(i)->nullable(); - const auto out_field_nullable = - checked_cast(*out->type()).field(i)->nullable(); - - if (in_field_nullable && !out_field_nullable) { + if (in_field->nullable() && !out_field->nullable()) { return Status::TypeError("cannot cast nullable struct to non-nullable struct: ", - batch[0].type()->ToString(), " ", - out->type()->ToString()); - } - } - - for (int i = 0; i < in_field_count; ++i) { - const auto in_field_name = - checked_cast(*batch[0].type()).field(i)->name(); - const auto out_field_name = - checked_cast(*out->type()).field(i)->name(); - if (in_field_name != out_field_name) { - return Status::TypeError( - "struct field names do not match: ", batch[0].type()->ToString(), " ", - out->type()->ToString()); + in_type.ToString(), " ", + out_type.ToString()); } } From 5ebefd65b73bc7fd6493b24e4e42694c5b5f5bac Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 9 Feb 2022 17:13:16 -0800 Subject: [PATCH 47/51] simplified tests --- .../arrow/compute/kernels/scalar_cast_test.cc | 31 ++++++++----------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 3ee02e451d6..0968a5d2922 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2247,12 +2247,11 @@ TEST(Cast, StructToSameSizedButDifferentNamedStruct) { b = ArrayFromJSON(int8(), "[3, 4]"); ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a, b}, field_names)); - std::vector field_names2 = {"c", "d"}; - std::shared_ptr c, d; - c = ArrayFromJSON(int8(), "[1, 2]"); - d = ArrayFromJSON(int8(), "[3, 4]"); - ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make({c, d}, field_names2)); - auto options = CastOptions::Safe(dest->type()); + const auto dest = arrow::struct_({ + std::make_shared("c", int8()), + std::make_shared("d", int8()) + }); + const auto options = CastOptions::Safe(dest); EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, @@ -2268,13 +2267,12 @@ TEST(Cast, StructToDifferentSizeStruct) { b = ArrayFromJSON(int8(), "[3, 4]"); ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a, b}, field_names)); - std::vector field_names2 = {"a", "b", "c"}; - std::shared_ptr a2, b2, c; - a2 = ArrayFromJSON(int8(), "[1, 2]"); - b2 = ArrayFromJSON(int8(), "[3, 4]"); - c = ArrayFromJSON(int8(), "[5, 6]"); - ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make({a2, b2, c}, field_names2)); - auto options = CastOptions::Safe(dest->type()); + const auto dest = arrow::struct_({ + std::make_shared("a", int8()), + std::make_shared("b", int8()), + std::make_shared("c", int8()) + }); + const auto options = CastOptions::Safe(dest); EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, @@ -2315,11 +2313,8 @@ TEST(Cast, StructToSameSizedButDifferentNullabilityStruct) { std::vector> fields4 = { std::make_shared("a", int8(), false), std::make_shared("b", int8(), false)}; - std::shared_ptr a4, b4; - a4 = ArrayFromJSON(int8(), "[1, 2]"); - b4 = ArrayFromJSON(int8(), "[3, 4]"); - ASSERT_OK_AND_ASSIGN(auto dest2, StructArray::Make({a4, b4}, fields4)); - auto options = CastOptions::Safe(dest2->type()); + const auto dest2 = arrow::struct_(fields4); + const auto options = CastOptions::Safe(dest2); EXPECT_RAISES_WITH_MESSAGE_THAT( TypeError, From 620002310df9236f250dbb5ab493463c0b376d36 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 9 Feb 2022 17:13:40 -0800 Subject: [PATCH 48/51] clang-format --- .../arrow/compute/kernels/scalar_cast_nested.cc | 15 ++++++--------- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 14 +++++--------- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 2b4b76a830e..93a1ee5fb0e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -155,27 +155,24 @@ struct CastStruct { const CastOptions& options = CastState::Get(ctx); const StructType& in_type = checked_cast(*batch[0].type()); const StructType& out_type = checked_cast(*out->type()); - const auto in_field_count = - in_type.num_fields(); + const auto in_field_count = in_type.num_fields(); if (in_field_count != out_type.num_fields()) { - return Status::TypeError("struct field sizes do not match: ", - in_type.ToString(), " ", out_type.ToString()); + return Status::TypeError("struct field sizes do not match: ", in_type.ToString(), + " ", out_type.ToString()); } for (int i = 0; i < in_field_count; ++i) { const auto in_field = in_type.field(i); const auto out_field = out_type.field(i); if (in_field->name() != out_field->name()) { - return Status::TypeError( - "struct field names do not match: ", in_type.ToString(), " ", - out_type.ToString()); + return Status::TypeError("struct field names do not match: ", in_type.ToString(), + " ", out_type.ToString()); } if (in_field->nullable() && !out_field->nullable()) { return Status::TypeError("cannot cast nullable struct to non-nullable struct: ", - in_type.ToString(), " ", - out_type.ToString()); + in_type.ToString(), " ", out_type.ToString()); } } diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 0968a5d2922..4c9764e4945 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2247,10 +2247,8 @@ TEST(Cast, StructToSameSizedButDifferentNamedStruct) { b = ArrayFromJSON(int8(), "[3, 4]"); ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a, b}, field_names)); - const auto dest = arrow::struct_({ - std::make_shared("c", int8()), - std::make_shared("d", int8()) - }); + const auto dest = arrow::struct_( + {std::make_shared("c", int8()), std::make_shared("d", int8())}); const auto options = CastOptions::Safe(dest); EXPECT_RAISES_WITH_MESSAGE_THAT( @@ -2267,11 +2265,9 @@ TEST(Cast, StructToDifferentSizeStruct) { b = ArrayFromJSON(int8(), "[3, 4]"); ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a, b}, field_names)); - const auto dest = arrow::struct_({ - std::make_shared("a", int8()), - std::make_shared("b", int8()), - std::make_shared("c", int8()) - }); + const auto dest = arrow::struct_({std::make_shared("a", int8()), + std::make_shared("b", int8()), + std::make_shared("c", int8())}); const auto options = CastOptions::Safe(dest); EXPECT_RAISES_WITH_MESSAGE_THAT( From 8e8e5d68ea8a9cb0b95dcfde8f6245c5a5ba794f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 9 Feb 2022 17:20:04 -0800 Subject: [PATCH 49/51] introduce null data --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 4 +--- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 8 ++++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 93a1ee5fb0e..9467ff62288 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -198,12 +198,10 @@ struct CastStruct { const ArrayData& in_array = *batch[0].array(); ArrayData* out_array = out->mutable_array(); out_array->buffers = in_array.buffers; + out_array->offset = in_array.offset; for (int i = 0; i < in_field_count; ++i) { auto values = in_array.child_data[i]; - if (in_array.offset != 0) { - values = values->Slice(in_array.offset, in_array.length); - } auto target_type = out->type()->field(i)->type(); ARROW_ASSIGN_OR_RAISE(Datum cast_values, diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index 4c9764e4945..a363bb597ee 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2224,10 +2224,10 @@ static void CheckStructToStruct( for (const auto& dest_value_type : value_types) { std::vector field_names = {"a", "b"}; std::shared_ptr a1, b1, a2, b2; - a1 = ArrayFromJSON(src_value_type, "[1, 2, 3, 4, 5]"); - b1 = ArrayFromJSON(src_value_type, "[6, 7, 8, 9, 0]"); - a2 = ArrayFromJSON(dest_value_type, "[1, 2, 3, 4, 5]"); - b2 = ArrayFromJSON(dest_value_type, "[6, 7, 8, 9, 0]"); + a1 = ArrayFromJSON(src_value_type, "[1, 2, 3, 4, null]"); + b1 = ArrayFromJSON(src_value_type, "[null, 7, 8, 9, 0]"); + a2 = ArrayFromJSON(dest_value_type, "[1, 2, 3, 4, null]"); + b2 = ArrayFromJSON(dest_value_type, "[null, 7, 8, 9, 0]"); ASSERT_OK_AND_ASSIGN(auto src, StructArray::Make({a1, b1}, field_names)); ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make({a2, b2}, field_names)); From 4214565a3ceb37ff895158fd43d0b4bf93b9ad90 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 9 Feb 2022 18:19:30 -0800 Subject: [PATCH 50/51] added test with nullability buffer --- cpp/src/arrow/compute/kernels/scalar_cast_test.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index a363bb597ee..8a4f2c69456 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2232,6 +2232,15 @@ static void CheckStructToStruct( ASSERT_OK_AND_ASSIGN(auto dest, StructArray::Make({a2, b2}, field_names)); CheckCast(src, dest); + + std::shared_ptr null_bitmap; + BitmapFromVector({0, 1, 0, 1, 0}, &null_bitmap); + + ASSERT_OK_AND_ASSIGN(auto src_nulls, + StructArray::Make({a1, b1}, field_names, null_bitmap)); + ASSERT_OK_AND_ASSIGN(auto dest_nulls, + StructArray::Make({a2, b2}, field_names, null_bitmap)); + CheckCast(src_nulls, dest_nulls); } } } From d61627184564f4b1d4e5cd161ce808038763d586 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 10 Feb 2022 12:56:39 -0800 Subject: [PATCH 51/51] more efficient slice / buffer handling --- cpp/src/arrow/compute/kernels/scalar_cast_nested.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index 9467ff62288..cc36c510363 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -197,11 +197,15 @@ struct CastStruct { const ArrayData& in_array = *batch[0].array(); ArrayData* out_array = out->mutable_array(); - out_array->buffers = in_array.buffers; - out_array->offset = in_array.offset; + + if (in_array.buffers[0]) { + ARROW_ASSIGN_OR_RAISE(out_array->buffers[0], + CopyBitmap(ctx->memory_pool(), in_array.buffers[0]->data(), + in_array.offset, in_array.length)); + } for (int i = 0; i < in_field_count; ++i) { - auto values = in_array.child_data[i]; + auto values = in_array.child_data[i]->Slice(in_array.offset, in_array.length); auto target_type = out->type()->field(i)->type(); ARROW_ASSIGN_OR_RAISE(Datum cast_values,