From fea8f3480632441d5c3a268be3f18d2cfa739a56 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Mon, 26 Jul 2021 19:09:13 +0200 Subject: [PATCH] ARROW-6072: [C++] Implement casting List <-> LargeList Implement bidirectional casts between List<T> and LargeList<U> assuming T and U can be cast to each other. --- .../compute/kernels/scalar_cast_internal.cc | 4 + .../compute/kernels/scalar_cast_nested.cc | 141 ++++++++++++------ .../arrow/compute/kernels/scalar_cast_test.cc | 79 +++++----- python/pyarrow/tests/test_compute.py | 5 + 4 files changed, 144 insertions(+), 85 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc index 198c82bd97e..8076c35a132 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc @@ -29,6 +29,8 @@ namespace internal { // ---------------------------------------------------------------------- +namespace { + template ARROW_DISABLE_UBSAN("float-cast-overflow") void DoStaticCast(const void* in_data, int64_t in_offset, int64_t length, @@ -117,6 +119,8 @@ void CastNumberImpl(Type::type out_type, const Datum& input, Datum* out) { } } +} // namespace + void CastNumberToNumberUnsafe(Type::type in_type, Type::type out_type, const Datum& input, Datum* out) { switch (in_type) { diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc index ec92dbb5d60..ab583bbbe8c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc @@ -17,6 +17,7 @@ // Implementation of casting to (or between) list types +#include #include #include @@ -26,6 +27,7 @@ #include "arrow/compute/kernels/common.h" #include "arrow/compute/kernels/scalar_cast_internal.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/int_util.h" namespace arrow { @@ -34,82 +36,135 @@ using internal::CopyBitmap; namespace 
compute { namespace internal { -template -Status CastListExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - using offset_type = typename Type::offset_type; - using ScalarType = typename TypeTraits::ScalarType; +namespace { - const CastOptions& options = CastState::Get(ctx); +// (Large)List -> (Large)List - auto child_type = checked_cast(*out->type()).value_type(); +template +typename std::enable_if::type +CastListOffsets(KernelContext* ctx, const ArrayData& in_array, ArrayData* out_array) { + return Status::OK(); +} + +template +typename std::enable_if::type +CastListOffsets(KernelContext* ctx, const ArrayData& in_array, ArrayData* out_array) { + using src_offset_type = typename SrcType::offset_type; + using dest_offset_type = typename DestType::offset_type; + + ARROW_ASSIGN_OR_RAISE(out_array->buffers[1], + ctx->Allocate(sizeof(dest_offset_type) * (in_array.length + 1))); + ::arrow::internal::CastInts(in_array.GetValues(1), + out_array->GetMutableValues(1), + in_array.length + 1); + return Status::OK(); +} + +template +struct CastList { + using src_offset_type = typename SrcType::offset_type; + using dest_offset_type = typename DestType::offset_type; + + static constexpr bool is_upcast = sizeof(src_offset_type) < sizeof(dest_offset_type); + static constexpr bool is_downcast = sizeof(src_offset_type) > sizeof(dest_offset_type); + + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const CastOptions& options = CastState::Get(ctx); + + auto child_type = checked_cast(*out->type()).value_type(); - if (out->kind() == Datum::SCALAR) { - const auto& in_scalar = checked_cast(*batch[0].scalar()); - auto out_scalar = checked_cast(out->scalar().get()); + if (out->kind() == Datum::SCALAR) { + // The scalar case is simple, as only the underlying values must be cast + const auto& in_scalar = checked_cast(*batch[0].scalar()); + auto out_scalar = checked_cast(out->scalar().get()); - DCHECK(!out_scalar->is_valid); - if (in_scalar.is_valid) 
{ - ARROW_ASSIGN_OR_RAISE(out_scalar->value, Cast(*in_scalar.value, child_type, options, - ctx->exec_context())); + DCHECK(!out_scalar->is_valid); + if (in_scalar.is_valid) { + ARROW_ASSIGN_OR_RAISE(out_scalar->value, Cast(*in_scalar.value, child_type, + options, ctx->exec_context())); - out_scalar->is_valid = true; + out_scalar->is_valid = true; + } + return Status::OK(); } - return Status::OK(); - } - const ArrayData& in_array = *batch[0].array(); - ArrayData* out_array = out->mutable_array(); + const ArrayData& in_array = *batch[0].array(); + auto offsets = in_array.GetValues(1); + Datum values = in_array.child_data[0]; - // Copy from parent - out_array->buffers = in_array.buffers; - Datum values = in_array.child_data[0]; + ArrayData* out_array = out->mutable_array(); + out_array->buffers = in_array.buffers; - if (in_array.offset != 0) { - if (in_array.buffers[0]) { + // Shift bitmap in case the source offset is non-zero + if (in_array.offset != 0 && in_array.buffers[0]) { ARROW_ASSIGN_OR_RAISE(out_array->buffers[0], CopyBitmap(ctx->memory_pool(), in_array.buffers[0]->data(), in_array.offset, in_array.length)); } - ARROW_ASSIGN_OR_RAISE(out_array->buffers[1], - ctx->Allocate(sizeof(offset_type) * (in_array.length + 1))); - auto offsets = in_array.GetValues(1); - auto shifted_offsets = out_array->GetMutableValues(1); + // Handle list offsets + // Several cases can arise: + // - the source offset is non-zero, in which case we slice the underlying values + // and shift the list offsets (regardless of their respective types) + // - the source offset is zero but source and destination types have + // different list offset types, in which case we cast the list offsets + // - otherwise, we simply keep the original list offsets + if (is_downcast) { + if (offsets[in_array.length] > std::numeric_limits::max()) { + return Status::Invalid("Array of type ", in_array.type->ToString(), + " too large to convert to ", out_array->type->ToString()); + } + } - for (int64_t i = 0; i 
< in_array.length + 1; ++i) { - shifted_offsets[i] = offsets[i] - offsets[0]; + if (in_array.offset != 0) { + ARROW_ASSIGN_OR_RAISE( + out_array->buffers[1], + ctx->Allocate(sizeof(dest_offset_type) * (in_array.length + 1))); + + auto shifted_offsets = out_array->GetMutableValues(1); + for (int64_t i = 0; i < in_array.length + 1; ++i) { + shifted_offsets[i] = static_cast(offsets[i] - offsets[0]); + } + values = in_array.child_data[0]->Slice(offsets[0], offsets[in_array.length]); + } else { + RETURN_NOT_OK((CastListOffsets(ctx, in_array, out_array))); } - values = in_array.child_data[0]->Slice(offsets[0], offsets[in_array.length]); - } - ARROW_ASSIGN_OR_RAISE(Datum cast_values, - Cast(values, child_type, options, ctx->exec_context())); + // Handle values + ARROW_ASSIGN_OR_RAISE(Datum cast_values, + Cast(values, child_type, options, ctx->exec_context())); - DCHECK_EQ(Datum::ARRAY, cast_values.kind()); - out_array->child_data.push_back(cast_values.array()); - return Status::OK(); -} + DCHECK_EQ(Datum::ARRAY, cast_values.kind()); + out_array->child_data.push_back(cast_values.array()); + return Status::OK(); + } +}; -template +template void AddListCast(CastFunction* func) { ScalarKernel kernel; - kernel.exec = CastListExec; - kernel.signature = KernelSignature::Make({InputType(Type::type_id)}, kOutputTargetType); + kernel.exec = CastList::Exec; + kernel.signature = + KernelSignature::Make({InputType(SrcType::type_id)}, kOutputTargetType); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; - DCHECK_OK(func->AddKernel(Type::type_id, std::move(kernel))); + DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel))); } +} // namespace + std::vector> GetNestedCasts() { // We use the list from the CastOptions when resolving the output type auto cast_list = std::make_shared("cast_list", Type::LIST); AddCommonCasts(Type::LIST, kOutputTargetType, cast_list.get()); - AddListCast(cast_list.get()); + AddListCast(cast_list.get()); + AddListCast(cast_list.get()); auto 
cast_large_list = std::make_shared("cast_large_list", Type::LARGE_LIST); AddCommonCasts(Type::LARGE_LIST, kOutputTargetType, cast_large_list.get()); - AddListCast(cast_large_list.get()); + AddListCast(cast_large_list.get()); + AddListCast(cast_large_list.get()); // FSL is a bit incomplete at the moment auto cast_fsl = diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index fce8518dd3b..9f537fecf55 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -1676,56 +1676,51 @@ TEST(Cast, ListToPrimitive) { Cast(*ArrayFromJSON(list(binary()), R"([["1", "2"], ["3", "4"]])"), utf8())); } -TEST(Cast, ListToList) { - using make_list_t = std::shared_ptr(const std::shared_ptr&); - for (auto make_list : std::vector{&list, &large_list}) { - auto list_int32 = - ArrayFromJSON(make_list(int32()), - "[[0], [1], null, [2, 3, 4], [5, 6], null, [], [7], [8, 9]]") - ->data(); - - auto list_int64 = list_int32->Copy(); - list_int64->type = make_list(int64()); - list_int64->child_data[0] = Cast(list_int32->child_data[0], int64())->array(); - ValidateOutput(*list_int64); - - auto list_float32 = list_int32->Copy(); - list_float32->type = make_list(float32()); - list_float32->child_data[0] = Cast(list_int32->child_data[0], float32())->array(); - ValidateOutput(*list_float32); - - CheckCast(MakeArray(list_int32), MakeArray(list_float32)); - CheckCast(MakeArray(list_float32), MakeArray(list_int64)); - CheckCast(MakeArray(list_int64), MakeArray(list_float32)); - - CheckCast(MakeArray(list_int32), MakeArray(list_int64)); - CheckCast(MakeArray(list_float32), MakeArray(list_int32)); - CheckCast(MakeArray(list_int64), MakeArray(list_int32)); +using make_list_t = std::shared_ptr(const std::shared_ptr&); + +static const auto list_factories = std::vector{&list, &large_list}; + +static void CheckListToList(const std::vector>& value_types, + const std::string& json_data) { + 
for (auto make_src_list : list_factories) { + for (auto make_dest_list : list_factories) { + for (const auto& src_value_type : value_types) { + for (const auto& dest_value_type : value_types) { + const auto src_type = make_src_list(src_value_type); + const auto dest_type = make_dest_list(dest_value_type); + ARROW_SCOPED_TRACE("src_type = ", src_type->ToString(), + ", dest_type = ", dest_type->ToString()); + CheckCast(ArrayFromJSON(src_type, json_data), + ArrayFromJSON(dest_type, json_data)); + } + } + } } +} - // No nulls (ARROW-12568) - for (auto make_list : std::vector{&list, &large_list}) { - auto list_int32 = ArrayFromJSON(make_list(int32()), - "[[0], [1], [2, 3, 4], [5, 6], [], [7], [8, 9]]") - ->data(); - auto list_int64 = list_int32->Copy(); - list_int64->type = make_list(int64()); - list_int64->child_data[0] = Cast(list_int32->child_data[0], int64())->array(); - ValidateOutput(*list_int64); +TEST(Cast, ListToList) { + CheckListToList({int32(), float32(), int64()}, + "[[0], [1], null, [2, 3, 4], [5, 6], null, [], [7], [8, 9]]"); +} - CheckCast(MakeArray(list_int32), MakeArray(list_int64)); - CheckCast(MakeArray(list_int64), MakeArray(list_int32)); - } +TEST(Cast, ListToListNoNulls) { + // ARROW-12568 + CheckListToList({int32(), float32(), int64()}, + "[[0], [1], [2, 3, 4], [5, 6], [], [7], [8, 9]]"); } TEST(Cast, ListToListOptionsPassthru) { - auto list_int32 = ArrayFromJSON(list(int32()), "[[87654321]]"); + for (auto make_src_list : list_factories) { + for (auto make_dest_list : list_factories) { + auto list_int32 = ArrayFromJSON(make_src_list(int32()), "[[87654321]]"); - auto options = CastOptions::Safe(list(int16())); - CheckCastFails(list_int32, options); + auto options = CastOptions::Safe(make_dest_list(int16())); + CheckCastFails(list_int32, options); - options.allow_int_overflow = true; - CheckCast(list_int32, ArrayFromJSON(list(int16()), "[[32689]]"), options); + options.allow_int_overflow = true; + CheckCast(list_int32, 
ArrayFromJSON(make_dest_list(int16()), "[[32689]]"), options); + } + } } TEST(Cast, IdentityCasts) { diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 993742fe664..b0baa76e50a 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1347,6 +1347,11 @@ def test_cast(): expected = pa.array([1262304000000, 1420070400000], type='timestamp[ms]') assert pc.cast(arr, 'timestamp[ms]') == expected + arr = pa.array([[1, 2], [3, 4, 5]], type=pa.large_list(pa.int8())) + expected = pa.array([["1", "2"], ["3", "4", "5"]], + type=pa.list_(pa.utf8())) + assert pc.cast(arr, expected.type) == expected + def test_strptime(): arr = pa.array(["5/1/2020", None, "12/13/1900"])