From 058aad07aef4e1af3ee995abcee5e0473521a6d5 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sun, 3 Dec 2017 19:22:56 +0900 Subject: [PATCH 1/6] ENH: Implement List to List cast --- cpp/src/arrow/compute/kernels/cast.cc | 34 +++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 465be958cfa..a4f54e2e072 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -460,6 +460,29 @@ struct CastFunctor { } }; +// ---------------------------------------------------------------------- +// List to List + +template <> +struct CastFunctor { + void operator()(FunctionContext* ctx, const CastOptions& options, + const ArrayData& input, ArrayData* output) { + Datum datum_out; + ListArray input_array(input.Copy()); + + const ListType& type = static_cast(*output->type); + + FUNC_RETURN_NOT_OK( + Cast(ctx, Datum(input_array.values()), type.value_type(), options, &datum_out)); + + FUNC_RETURN_NOT_OK( + input.buffers[0]->Copy(0, input.buffers[0]->size(), &output->buffers[0])); + FUNC_RETURN_NOT_OK( + input.buffers[1]->Copy(0, input.buffers[1]->size(), &output->buffers[1])); + output->child_data.emplace_back(datum_out.array()); + } +}; + // ---------------------------------------------------------------------- // Dictionary to other things @@ -698,13 +721,16 @@ static Status AllocateIfNotPreallocated(FunctionContext* ctx, const ArrayData& i const Type::type type_id = out->type->id(); if (!(is_primitive(type_id) || type_id == Type::FIXED_SIZE_BINARY || - type_id == Type::DECIMAL)) { + type_id == Type::DECIMAL) && + type_id != Type::LIST) { std::stringstream ss; ss << "Cannot pre-allocate memory for type: " << out->type->ToString(); return Status::NotImplemented(ss.str()); } - if (type_id != Type::NA) { + if (type_id == Type::LIST) { + out->buffers.resize(2); + } else if (type_id != Type::NA) { const auto& fw_type = static_cast(*out->type); int bit_width = fw_type.bit_width(); @@ -860,6 +886,8 @@ class CastKernel : public UnaryKernel { FN(IN_TYPE, BinaryType); \ FN(IN_TYPE, StringType); +#define LIST_CASES(FN, IN_TYPE) FN(IN_TYPE, ListType) + #define GET_CAST_FUNCTION(CASE_GENERATOR, InType) \ static std::unique_ptr Get##InType##CastFunc( \ const std::shared_ptr& out_type, const CastOptions& options) { \ @@ -897,6 +925,7 @@ GET_CAST_FUNCTION(TIME64_CASES, Time64Type); GET_CAST_FUNCTION(TIMESTAMP_CASES, TimestampType); GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType); +GET_CAST_FUNCTION(LIST_CASES, ListType); #define CAST_FUNCTION_CASE(InType) \ case InType::type_id: \ @@ -924,6 +953,7 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr& CAST_FUNCTION_CASE(Time64Type); CAST_FUNCTION_CASE(TimestampType); CAST_FUNCTION_CASE(DictionaryType); + CAST_FUNCTION_CASE(ListType); default: break; } From cca90681bbd14584363a9ff081a89e25e3710aad Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sun, 3 Dec 2017 21:44:28 +0900 Subject: [PATCH 2/6] TST: Add test for casting from List to List --- cpp/src/arrow/compute/compute-test.cc | 41 +++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc index 84af8f7c6b0..3fc15018630 100644 --- a/cpp/src/arrow/compute/compute-test.cc +++ b/cpp/src/arrow/compute/compute-test.cc @@ -801,6 +801,47 @@ TYPED_TEST(TestDictionaryCast, Basic) { this->CheckPass(*plain_array, *dict_array, dict_array->type(), options); }*/ +TEST_F(TestCast, ListToList) { + CastOptions options; + std::shared_ptr offsets; + + vector offsets_values = {0, 1, 2, 5, 7, 7, 8, 10}; + std::vector offsets_is_valid = {true, true, true, true, false, true, true, true}; + ArrayFromVector(offsets_is_valid, offsets_values, &offsets); + + shared_ptr int32_plain_array = + TestBase::MakeRandomArray::ArrayType>(10, 2); + std::shared_ptr int32_list_array; + ASSERT_OK( + ListArray::FromArrays(*offsets, *int32_plain_array, pool_, &int32_list_array)); + + std::shared_ptr int64_plain_array; + ASSERT_OK(Cast(&this->ctx_, *int32_plain_array, int64(), options, &int64_plain_array)); + std::shared_ptr int64_list_array; + ASSERT_OK( + ListArray::FromArrays(*offsets, *int64_plain_array, pool_, &int64_list_array)); + + std::shared_ptr float64_plain_array; + ASSERT_OK( + Cast(&this->ctx_, *int32_plain_array, float64(), options, &float64_plain_array)); + std::shared_ptr float64_list_array; + ASSERT_OK( + ListArray::FromArrays(*offsets, *float64_plain_array, pool_, &float64_list_array)); + + this->CheckPass(*int32_list_array, *int64_list_array, int64_list_array->type(), + options); + this->CheckPass(*int32_list_array, *float64_list_array, float64_list_array->type(), + options); + this->CheckPass(*int64_list_array, *int32_list_array, int32_list_array->type(), + options); + this->CheckPass(*int64_list_array, *float64_list_array, float64_list_array->type(), + options); + this->CheckPass(*float64_list_array, *int32_list_array, int32_list_array->type(), + options); + this->CheckPass(*float64_list_array, *int64_list_array, int64_list_array->type(), + options); +} + // ---------------------------------------------------------------------- // Dictionary tests From a8046d084f1c6425aeed6a9a3a646c8ca72b1d19 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Tue, 5 Dec 2017 08:05:15 +0900 Subject: [PATCH 3/6] Implement child cast kernel checking before cast evaluation --- cpp/src/arrow/compute/kernels/cast.cc | 85 +++++++++++++++++---------- 1 file changed, 53 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index a4f54e2e072..927f81eb24a 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -467,19 +467,8 @@ template <> struct CastFunctor { void operator()(FunctionContext* ctx, const CastOptions& options, const ArrayData& input, ArrayData* output) { - Datum datum_out; - ListArray input_array(input.Copy()); - - const ListType& type = static_cast(*output->type); - - FUNC_RETURN_NOT_OK( - Cast(ctx, Datum(input_array.values()), type.value_type(), options, &datum_out)); - - FUNC_RETURN_NOT_OK( - input.buffers[0]->Copy(0, input.buffers[0]->size(), &output->buffers[0])); - FUNC_RETURN_NOT_OK( - input.buffers[1]->Copy(0, input.buffers[1]->size(), &output->buffers[1])); - output->child_data.emplace_back(datum_out.array()); + output->buffers[0] = input.buffers[0]; + output->buffers[1] = input.buffers[1]; } }; @@ -764,6 +753,23 @@ class CastKernel : public UnaryKernel { can_pre_allocate_values_(can_pre_allocate_values), out_type_(out_type) {} + Status SetChildCastKernel(const DataType& in_type) { + if (in_type.num_children() != out_type_->num_children()) { + return Status::SerializationError("Number of children are not same."); + } + + for (int i = 0; i < in_type.num_children(); i++) { + auto in_field = in_type.child(i); + auto out_field = out_type_->child(i); + std::unique_ptr child_cast_kernel; + RETURN_NOT_OK(GetCastFunction(*in_field->type(), out_field->type(), options_, + &child_cast_kernel)); + + child_cast_kernels.emplace_back(std::move(child_cast_kernel)); + } + return Status::OK(); + } + Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override { DCHECK_EQ(Datum::ARRAY, input.kind()); @@ -782,6 +788,14 @@ class CastKernel : public UnaryKernel { } func_(ctx, options_, in_data, result); + for (size_t i = 0; i < in_data.child_data.size(); i++) { + auto in_child_data = in_data.child_data[i]; + + Datum datum_out; + RETURN_NOT_OK(child_cast_kernels[i]->Call(ctx, Datum(in_child_data), &datum_out)); + result->child_data.emplace_back(datum_out.array()); + } + RETURN_IF_ERROR(ctx); return Status::OK(); } @@ -792,6 +806,7 @@ class CastKernel : public UnaryKernel { bool is_zero_copy_; bool can_pre_allocate_values_; std::shared_ptr out_type_; + std::vector> child_cast_kernels; }; #define CAST_CASE(InType, OutType) \ @@ -888,22 +903,28 @@ class CastKernel : public UnaryKernel { #define LIST_CASES(FN, IN_TYPE) FN(IN_TYPE, ListType) -#define GET_CAST_FUNCTION(CASE_GENERATOR, InType) \ - static std::unique_ptr Get##InType##CastFunc( \ - const std::shared_ptr& out_type, const CastOptions& options) { \ - CastFunction func; \ - bool is_zero_copy = false; \ - bool can_pre_allocate_values = true; \ - switch (out_type->id()) { \ - CASE_GENERATOR(CAST_CASE, InType); \ - default: \ - break; \ - } \ - if (func != nullptr) { \ - return std::unique_ptr(new CastKernel( \ - options, func, is_zero_copy, can_pre_allocate_values, out_type)); \ - } \ - return nullptr; \ +#define GET_CAST_FUNCTION(CASE_GENERATOR, InType) \ + static std::unique_ptr Get##InType##CastFunc( \ + const DataType& in_type, const std::shared_ptr& out_type, \ + const CastOptions& options) { \ + CastFunction func; \ + bool is_zero_copy = false; \ + bool can_pre_allocate_values = true; \ + switch (out_type->id()) { \ + CASE_GENERATOR(CAST_CASE, InType); \ + default: \ + break; \ + } \ + if (func != nullptr) { \ + auto cast_kernel = new CastKernel(options, func, is_zero_copy, \ + can_pre_allocate_values, out_type); \ + if (cast_kernel->SetChildCastKernel(in_type).ok()) { \ + return std::unique_ptr(cast_kernel); \ + } else { \ + return nullptr; \ + } \ + } \ + return nullptr; \ } GET_CAST_FUNCTION(NULL_CASES, NullType); @@ -927,9 +948,9 @@ GET_CAST_FUNCTION(TIMESTAMP_CASES, TimestampType); GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType); GET_CAST_FUNCTION(LIST_CASES, ListType); -#define CAST_FUNCTION_CASE(InType) \ - case InType::type_id: \ - *kernel = Get##InType##CastFunc(out_type, options); \ +#define CAST_FUNCTION_CASE(InType) \ + case InType::type_id: \ + *kernel = Get##InType##CastFunc(in_type, out_type, options); \ break Status GetCastFunction(const DataType& in_type, const std::shared_ptr& out_type, From 87a002077bf69c50a1f2f8b14cd206f07185c036 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 Dec 2017 17:43:07 -0500 Subject: [PATCH 4/6] Implement ListCastKernel Change-Id: I6202c1de8663c0bec9408e158eb19aa29636426f --- cpp/src/arrow/compute/kernels/cast.cc | 145 ++++++++++++++------------ 1 file changed, 80 insertions(+), 65 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 927f81eb24a..0628857e4c2 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -463,13 +463,44 @@ struct CastFunctor { // ---------------------------------------------------------------------- // List to List -template <> -struct CastFunctor { - void operator()(FunctionContext* ctx, const CastOptions& options, - const ArrayData& input, ArrayData* output) { - output->buffers[0] = input.buffers[0]; - output->buffers[1] = input.buffers[1]; +class ListCastKernel : public UnaryKernel { + public: + ListCastKernel(std::unique_ptr child_caster, + const std::shared_ptr& out_type) + : child_caster_(std::move(child_caster)), out_type_(out_type) {} + + Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override { + DCHECK_EQ(Datum::ARRAY, input.kind()); + + const ArrayData& in_data = *input.array(); + DCHECK_EQ(Type::LIST, in_data.type->id()); + ArrayData* result; + + if (in_data.offset != 0) { + return Status::NotImplemented( + "Casting sliced lists (non-zero offset) not yet implemented"); + } + + if (out->kind() == Datum::NONE) { + out->value = ArrayData::Make(out_type_, in_data.length); + } + + result = out->array().get(); + + // Copy buffers from parent + result->buffers = in_data.buffers; + + Datum casted_child; + RETURN_NOT_OK(child_caster_->Call(ctx, Datum(in_data.child_data[0]), &casted_child)); + result->child_data.push_back(casted_child.array()); + + RETURN_IF_ERROR(ctx); + return Status::OK(); } + + private: + std::unique_ptr child_caster_; + std::shared_ptr out_type_; }; // ---------------------------------------------------------------------- @@ -710,16 +741,13 @@ static Status AllocateIfNotPreallocated(FunctionContext* ctx, const ArrayData& i const Type::type type_id = out->type->id(); if (!(is_primitive(type_id) || type_id == Type::FIXED_SIZE_BINARY || - type_id == Type::DECIMAL) && - type_id != Type::LIST) { + type_id == Type::DECIMAL)) { std::stringstream ss; ss << "Cannot pre-allocate memory for type: " << out->type->ToString(); return Status::NotImplemented(ss.str()); } - if (type_id == Type::LIST) { - out->buffers.resize(2); - } else if (type_id != Type::NA) { + if (type_id != Type::NA) { const auto& fw_type = static_cast(*out->type); int bit_width = fw_type.bit_width(); @@ -753,23 +781,6 @@ class CastKernel : public UnaryKernel { can_pre_allocate_values_(can_pre_allocate_values), out_type_(out_type) {} - Status SetChildCastKernel(const DataType& in_type) { - if (in_type.num_children() != out_type_->num_children()) { - return Status::SerializationError("Number of children are not same."); - } - - for (int i = 0; i < in_type.num_children(); i++) { - auto in_field = in_type.child(i); - auto out_field = out_type_->child(i); - std::unique_ptr child_cast_kernel; - RETURN_NOT_OK(GetCastFunction(*in_field->type(), out_field->type(), options_, - &child_cast_kernel)); - - child_cast_kernels.emplace_back(std::move(child_cast_kernel)); - } - return Status::OK(); - } - Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override { DCHECK_EQ(Datum::ARRAY, input.kind()); @@ -788,14 +799,6 @@ class CastKernel : public UnaryKernel { } func_(ctx, options_, in_data, result); - for (size_t i = 0; i < in_data.child_data.size(); i++) { - auto in_child_data = in_data.child_data[i]; - - Datum datum_out; - RETURN_NOT_OK(child_cast_kernels[i]->Call(ctx, Datum(in_child_data), &datum_out)); - result->child_data.emplace_back(datum_out.array()); - } - RETURN_IF_ERROR(ctx); return Status::OK(); } @@ -806,7 +809,6 @@ class CastKernel : public UnaryKernel { bool is_zero_copy_; bool can_pre_allocate_values_; std::shared_ptr out_type_; - std::vector> child_cast_kernels; }; #define CAST_CASE(InType, OutType) \ @@ -903,28 +905,22 @@ class CastKernel : public UnaryKernel { #define LIST_CASES(FN, IN_TYPE) FN(IN_TYPE, ListType) -#define GET_CAST_FUNCTION(CASE_GENERATOR, InType) \ - static std::unique_ptr Get##InType##CastFunc( \ - const DataType& in_type, const std::shared_ptr& out_type, \ - const CastOptions& options) { \ - CastFunction func; \ - bool is_zero_copy = false; \ - bool can_pre_allocate_values = true; \ - switch (out_type->id()) { \ - CASE_GENERATOR(CAST_CASE, InType); \ - default: \ - break; \ - } \ - if (func != nullptr) { \ - auto cast_kernel = new CastKernel(options, func, is_zero_copy, \ - can_pre_allocate_values, out_type); \ - if (cast_kernel->SetChildCastKernel(in_type).ok()) { \ - return std::unique_ptr(cast_kernel); \ - } else { \ - return nullptr; \ - } \ - } \ - return nullptr; \ +#define GET_CAST_FUNCTION(CASE_GENERATOR, InType) \ + static std::unique_ptr Get##InType##CastFunc( \ + const std::shared_ptr& out_type, const CastOptions& options) { \ + CastFunction func; \ + bool is_zero_copy = false; \ + bool can_pre_allocate_values = true; \ + switch (out_type->id()) { \ + CASE_GENERATOR(CAST_CASE, InType); \ + default: \ + break; \ + } \ + if (func != nullptr) { \ + return std::unique_ptr(new CastKernel( \ + options, func, is_zero_copy, can_pre_allocate_values, out_type)); \ + } \ + return nullptr; \ } GET_CAST_FUNCTION(NULL_CASES, NullType); @@ -944,15 +940,32 @@ GET_CAST_FUNCTION(DATE64_CASES, Date64Type); GET_CAST_FUNCTION(TIME32_CASES, Time32Type); GET_CAST_FUNCTION(TIME64_CASES, Time64Type); GET_CAST_FUNCTION(TIMESTAMP_CASES, TimestampType); - GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType); -GET_CAST_FUNCTION(LIST_CASES, ListType); -#define CAST_FUNCTION_CASE(InType) \ - case InType::type_id: \ - *kernel = Get##InType##CastFunc(in_type, out_type, options); \ +#define CAST_FUNCTION_CASE(InType) \ + case InType::type_id: \ + *kernel = Get##InType##CastFunc(out_type, options); \ break +namespace { + +Status GetListCastFunc(const DataType& in_type, const std::shared_ptr& out_type, + const CastOptions& options, std::unique_ptr* kernel) { + if (out_type->id() != Type::LIST) { + // Kernel will be null + return Status::OK(); + } + const DataType& in_value_type = *static_cast(in_type).value_type(); + std::shared_ptr out_value_type = + static_cast(*out_type).value_type(); + std::unique_ptr child_caster; + RETURN_NOT_OK(GetCastFunction(in_value_type, out_value_type, options, &child_caster)); + *kernel = + std::unique_ptr(new ListCastKernel(std::move(child_caster), out_type)); + return Status::OK(); +} +} + Status GetCastFunction(const DataType& in_type, const std::shared_ptr& out_type, const CastOptions& options, std::unique_ptr* kernel) { switch (in_type.id()) { @@ -974,7 +987,9 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr& CAST_FUNCTION_CASE(Time64Type); CAST_FUNCTION_CASE(TimestampType); CAST_FUNCTION_CASE(DictionaryType); - CAST_FUNCTION_CASE(ListType); + case Type::LIST: + RETURN_NOT_OK(GetListCastFunc(in_type, out_type, options, kernel)); + break; default: break; } From 0903e5fc58ab048a114f14aa3d9f4c4b99c92262 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 5 Dec 2017 17:45:20 -0500 Subject: [PATCH 5/6] Remove unneeded macro Change-Id: I7ed4b2d7ff7ab800ec2dc8d5df794dc2c6b1bdd7 --- cpp/src/arrow/compute/kernels/cast.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 0628857e4c2..475655e234a 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -903,8 +903,6 @@ class CastKernel : public UnaryKernel { FN(IN_TYPE, BinaryType); \ FN(IN_TYPE, StringType); -#define LIST_CASES(FN, IN_TYPE) FN(IN_TYPE, ListType) - #define GET_CAST_FUNCTION(CASE_GENERATOR, InType) \ static std::unique_ptr Get##InType##CastFunc( \ const std::shared_ptr& out_type, const CastOptions& options) { \ From 53a21a5f7b37f581bcf4e92876afac9c4082d972 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 6 Dec 2017 15:53:27 -0500 Subject: [PATCH 6/6] Fix cpplint warning Change-Id: I7167f22768b935202aabc57a9635b0ede13e2f03 --- cpp/src/arrow/compute/kernels/cast.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 475655e234a..afa05485f65 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -962,7 +962,8 @@ Status GetListCastFunc(const DataType& in_type, const std::shared_ptr& std::unique_ptr(new ListCastKernel(std::move(child_caster), out_type)); return Status::OK(); } -} + +} // namespace Status GetCastFunction(const DataType& in_type, const std::shared_ptr& out_type, const CastOptions& options, std::unique_ptr* kernel) {