From 26e8854db6d8505ba0aee8d47d8a7b37abadafa3 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 1 Jun 2020 16:58:12 -0500 Subject: [PATCH 1/2] Use Take to implement dictionary to dense cast for less code duplication --- .../compute/kernels/scalar_cast_internal.cc | 17 ++ .../compute/kernels/scalar_cast_internal.h | 177 +----------------- .../compute/kernels/scalar_cast_numeric.cc | 2 +- 3 files changed, 23 insertions(+), 173 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc index 4780da60749..4f32a9978f1 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc @@ -24,6 +24,23 @@ namespace arrow { namespace compute { namespace internal { +void UnpackDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + DictionaryArray dict_arr(batch[0].array()); + Result result = Take(Datum(dict_arr.dictionary()), Datum(dict_arr.indices()), + /*options=*/TakeOptions::Defaults(), ctx->exec_context()); + if (!result.ok()) { + ctx->SetStatus(result.status()); + return; + } + *out = *result; +} + +void OutputAllNull(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + ArrayData* output = out->mutable_array(); + output->buffers = {nullptr}; + output->null_count = batch.length; +} + void CastFromExtension(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const CastOptions& options = checked_cast(ctx->state())->options; diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.h b/cpp/src/arrow/compute/kernels/scalar_cast_internal.h index e6c687288ed..7d9baf6c0ce 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.h +++ b/cpp/src/arrow/compute/kernels/scalar_cast_internal.h @@ -21,6 +21,7 @@ #include #include "arrow/builder.h" +#include "arrow/compute/api_vector.h" #include "arrow/compute/cast.h" #include "arrow/compute/cast_internal.h" #include "arrow/compute/kernels/common.h" @@ -47,177 +48,9 @@ void CastFromExtension(KernelContext* ctx, const ExecBatch& batch, Datum* out); // ---------------------------------------------------------------------- // Dictionary to other things -template -struct FromDictVisitor {}; +void UnpackDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out); -// Visitor for Dict -template -struct FromDictVisitor> { - using ArrayType = typename TypeTraits::ArrayType; - - FromDictVisitor(KernelContext* ctx, const ArrayType& dictionary, ArrayData* output) - : dictionary_(dictionary), - byte_width_(dictionary.byte_width()), - out_(output->buffers[1]->mutable_data() + byte_width_ * output->offset) {} - - Status Init() { return Status::OK(); } - - Status VisitNull() { - memset(out_, 0, byte_width_); - out_ += byte_width_; - return Status::OK(); - } - - Status VisitValue(typename IndexType::c_type dict_index) { - const uint8_t* value = dictionary_.Value(dict_index); - memcpy(out_, value, byte_width_); - out_ += byte_width_; - return Status::OK(); - } - - Status Finish() { return Status::OK(); } - - const ArrayType& dictionary_; - int32_t byte_width_; - uint8_t* out_; -}; - -// Visitor for Dict -template -struct FromDictVisitor> { - using ArrayType = typename TypeTraits::ArrayType; - - FromDictVisitor(KernelContext* ctx, const ArrayType& dictionary, ArrayData* output) - : ctx_(ctx), dictionary_(dictionary), output_(output) {} - - Status Init() { - RETURN_NOT_OK(MakeBuilder(ctx_->memory_pool(), output_->type, &builder_)); - binary_builder_ = checked_cast(builder_.get()); - return Status::OK(); - } - - Status VisitNull() { return binary_builder_->AppendNull(); } - - Status VisitValue(typename IndexType::c_type dict_index) { - return binary_builder_->Append(dictionary_.GetView(dict_index)); - } - - Status Finish() { - std::shared_ptr plain_array; - RETURN_NOT_OK(binary_builder_->Finish(&plain_array)); - output_->buffers = plain_array->data()->buffers; - return Status::OK(); - } - - KernelContext* ctx_; - const ArrayType& dictionary_; - ArrayData* output_; - std::unique_ptr builder_; - BinaryBuilder* binary_builder_; -}; - -// Visitor for Dict -template -struct FromDictVisitor< - T, IndexType, enable_if_t::value || is_temporal_type::value>> { - using ArrayType = typename TypeTraits::ArrayType; - - using value_type = typename T::c_type; - - FromDictVisitor(KernelContext* ctx, const ArrayType& dictionary, ArrayData* output) - : dictionary_(dictionary), out_(output->GetMutableValues(1)) {} - - Status Init() { return Status::OK(); } - - Status VisitNull() { - *out_++ = value_type{}; // Zero-initialize - return Status::OK(); - } - - Status VisitValue(typename IndexType::c_type dict_index) { - *out_++ = dictionary_.Value(dict_index); - return Status::OK(); - } - - Status Finish() { return Status::OK(); } - - const ArrayType& dictionary_; - value_type* out_; -}; - -template -struct FromDictUnpackHelper { - using ArrayType = typename TypeTraits::ArrayType; - - template - void Unpack(KernelContext* ctx, const ArrayData& indices, const ArrayType& dictionary, - ArrayData* output) { - FromDictVisitor visitor{ctx, dictionary, output}; - KERNEL_RETURN_IF_ERROR(ctx, visitor.Init()); - KERNEL_RETURN_IF_ERROR(ctx, ArrayDataVisitor::Visit(indices, &visitor)); - KERNEL_RETURN_IF_ERROR(ctx, visitor.Finish()); - } -}; - -// Dispatch dictionary casts to UnpackHelper -template -struct FromDictionaryCast { - using ArrayType = typename TypeTraits::ArrayType; - - static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const ArrayData& input = *batch[0].array(); - ArrayData* output = out->mutable_array(); - - const DictionaryType& type = checked_cast(*input.type); - const Array& dictionary = *input.dictionary; - const DataType& values_type = *dictionary.type(); - - // ARROW-7077 - if (!values_type.Equals(*output->type)) { - ctx->SetStatus(Status::Invalid("Cannot unpack dictionary of type ", type.ToString(), - " to type ", output->type->ToString())); - return; - } - - FromDictUnpackHelper unpack_helper; - switch (type.index_type()->id()) { - case Type::INT8: - unpack_helper.template Unpack( - ctx, input, static_cast(dictionary), output); - break; - case Type::INT16: - unpack_helper.template Unpack( - ctx, input, static_cast(dictionary), output); - break; - case Type::INT32: - unpack_helper.template Unpack( - ctx, input, static_cast(dictionary), output); - break; - case Type::INT64: - unpack_helper.template Unpack( - ctx, input, static_cast(dictionary), output); - break; - default: - ctx->SetStatus( - Status::TypeError("Invalid index type: ", type.index_type()->ToString())); - break; - } - } -}; - -template <> -struct FromDictionaryCast { - static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - ArrayData* output = out->mutable_array(); - output->buffers = {nullptr}; - output->null_count = batch.length; - } -}; - -template <> -struct FromDictionaryCast { - static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {} -}; +void OutputAllNull(KernelContext* ctx, const ExecBatch& batch, Datum* out); template struct FromNullCast { @@ -258,11 +91,11 @@ struct MaybeAddFromDictionary { template struct MaybeAddFromDictionary< T, enable_if_t::value && !is_nested_type::value && - !std::is_same::value>> { + !is_null_type::value && !std::is_same::value>> { static void Add(const OutputType& out_ty, CastFunction* func) { // Dictionary unpacking not implemented for boolean or nested types DCHECK_OK(func->AddKernel(Type::DICTIONARY, {InputType::Array(Type::DICTIONARY)}, - out_ty, FromDictionaryCast::Exec)); + out_ty, UnpackDictionary)); } }; diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc index 773decd9d76..983d407f348 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc @@ -521,7 +521,7 @@ std::vector> GetNumericCasts() { // to cast from dict -> null but there are unit tests for it auto cast_null = std::make_shared("cast_null", Type::NA); DCHECK_OK(cast_null->AddKernel(Type::DICTIONARY, {InputType::Array(Type::DICTIONARY)}, - null(), FromDictionaryCast::Exec)); + null(), OutputAllNull)); functions.push_back(cast_null); functions.push_back(GetCastToInteger("cast_int8")); From f7ba46ef7a845f8177ec43b1e38be17909f1d595 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 1 Jun 2020 17:18:43 -0500 Subject: [PATCH 2/2] Fix test failures caused by preallocation and inadequate error checking --- cpp/src/arrow/compute/kernels/scalar_cast_internal.cc | 10 ++++++++++ cpp/src/arrow/compute/kernels/scalar_cast_internal.h | 10 +++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc index 4f32a9978f1..585a75b2ed5 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc @@ -26,6 +26,16 @@ namespace internal { void UnpackDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out) { DictionaryArray dict_arr(batch[0].array()); + const CastOptions& options = checked_cast(*ctx->state()).options; + + const auto& dict_type = *dict_arr.dictionary()->type(); + if (!dict_type.Equals(options.to_type)) { + ctx->SetStatus(Status::Invalid("Cast type ", options.to_type->ToString(), + " incompatible with dictionary type ", + dict_type.ToString())); + return; + } + Result result = Take(Datum(dict_arr.dictionary()), Datum(dict_arr.indices()), /*options=*/TakeOptions::Defaults(), ctx->exec_context()); if (!result.ok()) { diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_internal.h b/cpp/src/arrow/compute/kernels/scalar_cast_internal.h index 7d9baf6c0ce..ecbf9e7c062 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_internal.h +++ b/cpp/src/arrow/compute/kernels/scalar_cast_internal.h @@ -93,9 +93,13 @@ struct MaybeAddFromDictionary< T, enable_if_t::value && !is_nested_type::value && !is_null_type::value && !std::is_same::value>> { static void Add(const OutputType& out_ty, CastFunction* func) { - // Dictionary unpacking not implemented for boolean or nested types - DCHECK_OK(func->AddKernel(Type::DICTIONARY, {InputType::Array(Type::DICTIONARY)}, - out_ty, UnpackDictionary)); + // Dictionary unpacking not implemented for boolean or nested types. + // + // XXX: Uses Take and does its own memory allocation for the moment. We can + // fix this later. + DCHECK_OK(func->AddKernel( + Type::DICTIONARY, {InputType::Array(Type::DICTIONARY)}, out_ty, UnpackDictionary, + NullHandling::COMPUTED_NO_PREALLOCATE, MemAllocation::NO_PREALLOCATE)); } };