Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,33 @@ namespace arrow {
namespace compute {
namespace internal {

// Cast kernel: decodes a dictionary-encoded input array into a plain array by
// taking from the dictionary with the array's indices.
//
// The target type requested through the CastState held by `ctx` must equal the
// dictionary's value type; otherwise an Invalid status is set on `ctx` and
// `out` is not written.  A failing Take reports its status through `ctx` in
// the same way.
void UnpackDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
  DictionaryArray encoded(batch[0].array());
  const auto& cast_state = checked_cast<const CastState&>(*ctx->state());
  const CastOptions& options = cast_state.options;

  // Only an exact match between value type and target type is supported.
  const auto& value_type = *encoded.dictionary()->type();
  if (!value_type.Equals(options.to_type)) {
    ctx->SetStatus(Status::Invalid("Cast type ", options.to_type->ToString(),
                                   " incompatible with dictionary type ",
                                   value_type.ToString()));
    return;
  }

  Result<Datum> unpacked =
      Take(Datum(encoded.dictionary()), Datum(encoded.indices()),
           /*options=*/TakeOptions::Defaults(), ctx->exec_context());
  if (unpacked.ok()) {
    *out = *unpacked;
  } else {
    ctx->SetStatus(unpacked.status());
  }
}

// Kernel exec that marks the output array as entirely null: its buffer list
// becomes a single null buffer and its null count is set to the batch length.
// `ctx` is not used.
void OutputAllNull(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
  ArrayData& all_null = *out->mutable_array();
  all_null.null_count = batch.length;
  all_null.buffers = {nullptr};
}

void CastFromExtension(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const CastOptions& options = checked_cast<const CastState*>(ctx->state())->options;

Expand Down
185 changes: 11 additions & 174 deletions cpp/src/arrow/compute/kernels/scalar_cast_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <vector>

#include "arrow/builder.h"
#include "arrow/compute/api_vector.h"
#include "arrow/compute/cast.h"
#include "arrow/compute/cast_internal.h"
#include "arrow/compute/kernels/common.h"
Expand All @@ -47,177 +48,9 @@ void CastFromExtension(KernelContext* ctx, const ExecBatch& batch, Datum* out);
// ----------------------------------------------------------------------
// Dictionary to other things

// Primary template: intentionally empty. The usable visitors are the partial
// specializations selected through the `Enable` parameter.
template <typename T, typename IndexType, typename Enable = void>
struct FromDictVisitor {};
// Cast kernel that unpacks a dictionary-encoded input array into `out` by
// taking from the dictionary with the indices. Sets an Invalid status on `ctx`
// when the requested cast type differs from the dictionary's value type.
void UnpackDictionary(KernelContext* ctx, const ExecBatch& batch, Datum* out);

// Visitor for Dict<FixedSizeBinaryType>
//
// Emits one fixed-width slot per visited index straight into the output's
// data buffer (buffers[1]), beginning at the slot designated by the output
// offset. Null indices produce a zero-filled slot.
template <typename T, typename IndexType>
struct FromDictVisitor<T, IndexType, enable_if_fixed_size_binary<T>> {
  using ArrayType = typename TypeTraits<T>::ArrayType;

  FromDictVisitor(KernelContext* ctx, const ArrayType& dictionary, ArrayData* output)
      : dictionary_(dictionary),
        byte_width_(dictionary.byte_width()),
        // byte_width_ is declared (and therefore initialized) before out_
        out_(output->buffers[1]->mutable_data() + byte_width_ * output->offset) {}

  // Nothing to set up: the output buffer is written in place.
  Status Init() { return Status::OK(); }

  // Zero the current slot and move on to the next one.
  Status VisitNull() {
    memset(out_, 0, byte_width_);
    out_ += byte_width_;
    return Status::OK();
  }

  // Copy the dictionary entry at `dict_index` into the current slot.
  Status VisitValue(typename IndexType::c_type dict_index) {
    memcpy(out_, dictionary_.Value(dict_index), byte_width_);
    out_ += byte_width_;
    return Status::OK();
  }

  // Nothing to finalize.
  Status Finish() { return Status::OK(); }

  const ArrayType& dictionary_;
  int32_t byte_width_;
  uint8_t* out_;
};

// Visitor for Dict<BinaryType> (every type accepted by enable_if_base_binary)
//
// Values are appended to a builder created for the output type; Finish() then
// hands the builder's buffers over to the output ArrayData.
template <typename T, typename IndexType>
struct FromDictVisitor<T, IndexType, enable_if_base_binary<T>> {
  using ArrayType = typename TypeTraits<T>::ArrayType;
  // Use the builder class that belongs to T. Casting unconditionally to
  // BinaryBuilder is wrong for the Large* variants, whose builders are not
  // derived from BinaryBuilder.
  using BuilderType = typename TypeTraits<T>::BuilderType;

  FromDictVisitor(KernelContext* ctx, const ArrayType& dictionary, ArrayData* output)
      : ctx_(ctx), dictionary_(dictionary), output_(output) {}

  // Creates the builder for the output type from the kernel's memory pool.
  // Returns the MakeBuilder error, if any.
  Status Init() {
    RETURN_NOT_OK(MakeBuilder(ctx_->memory_pool(), output_->type, &builder_));
    binary_builder_ = checked_cast<BuilderType*>(builder_.get());
    return Status::OK();
  }

  Status VisitNull() { return binary_builder_->AppendNull(); }

  // Appends a copy of the dictionary entry at `dict_index`.
  Status VisitValue(typename IndexType::c_type dict_index) {
    return binary_builder_->Append(dictionary_.GetView(dict_index));
  }

  // Finalizes the builder and replaces the output's buffers with the
  // buffers of the built array.
  Status Finish() {
    std::shared_ptr<Array> plain_array;
    RETURN_NOT_OK(binary_builder_->Finish(&plain_array));
    output_->buffers = plain_array->data()->buffers;
    return Status::OK();
  }

  KernelContext* ctx_;
  const ArrayType& dictionary_;
  ArrayData* output_;
  std::unique_ptr<ArrayBuilder> builder_;
  // Non-owning view of builder_; valid only after a successful Init()
  BuilderType* binary_builder_ = nullptr;
};

// Visitor for Dict<NumericType | TemporalType>
//
// Writes one c_type value per visited index into the output's values buffer
// (buffer 1, as returned by GetMutableValues). Null indices write a
// zero-initialized value.
template <typename T, typename IndexType>
struct FromDictVisitor<
    T, IndexType, enable_if_t<is_number_type<T>::value || is_temporal_type<T>::value>> {
  using ArrayType = typename TypeTraits<T>::ArrayType;

  using value_type = typename T::c_type;

  FromDictVisitor(KernelContext* ctx, const ArrayType& dictionary, ArrayData* output)
      : dictionary_(dictionary), out_(output->GetMutableValues<value_type>(1)) {}

  // Nothing to set up: values are written in place.
  Status Init() { return Status::OK(); }

  Status VisitNull() {
    *out_ = value_type{};  // Zero-initialize
    ++out_;
    return Status::OK();
  }

  // Store the dictionary value found at `dict_index` in the next output slot.
  Status VisitValue(typename IndexType::c_type dict_index) {
    *out_ = dictionary_.Value(dict_index);
    ++out_;
    return Status::OK();
  }

  // Nothing to finalize.
  Status Finish() { return Status::OK(); }

  const ArrayType& dictionary_;
  value_type* out_;
};

template <typename T>
struct FromDictUnpackHelper {
using ArrayType = typename TypeTraits<T>::ArrayType;

template <typename IndexType>
void Unpack(KernelContext* ctx, const ArrayData& indices, const ArrayType& dictionary,
ArrayData* output) {
FromDictVisitor<T, IndexType> visitor{ctx, dictionary, output};
KERNEL_RETURN_IF_ERROR(ctx, visitor.Init());
KERNEL_RETURN_IF_ERROR(ctx, ArrayDataVisitor<IndexType>::Visit(indices, &visitor));
KERNEL_RETURN_IF_ERROR(ctx, visitor.Finish());
}
};

// Dispatch dictionary casts to UnpackHelper
template <typename T>
struct FromDictionaryCast {
using ArrayType = typename TypeTraits<T>::ArrayType;

static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const ArrayData& input = *batch[0].array();
ArrayData* output = out->mutable_array();

const DictionaryType& type = checked_cast<const DictionaryType&>(*input.type);
const Array& dictionary = *input.dictionary;
const DataType& values_type = *dictionary.type();

// ARROW-7077
if (!values_type.Equals(*output->type)) {
ctx->SetStatus(Status::Invalid("Cannot unpack dictionary of type ", type.ToString(),
" to type ", output->type->ToString()));
return;
}

FromDictUnpackHelper<T> unpack_helper;
switch (type.index_type()->id()) {
case Type::INT8:
unpack_helper.template Unpack<Int8Type>(
ctx, input, static_cast<const ArrayType&>(dictionary), output);
break;
case Type::INT16:
unpack_helper.template Unpack<Int16Type>(
ctx, input, static_cast<const ArrayType&>(dictionary), output);
break;
case Type::INT32:
unpack_helper.template Unpack<Int32Type>(
ctx, input, static_cast<const ArrayType&>(dictionary), output);
break;
case Type::INT64:
unpack_helper.template Unpack<Int64Type>(
ctx, input, static_cast<const ArrayType&>(dictionary), output);
break;
default:
ctx->SetStatus(
Status::TypeError("Invalid index type: ", type.index_type()->ToString()));
break;
}
}
};

template <>
struct FromDictionaryCast<NullType> {
static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
ArrayData* output = out->mutable_array();
output->buffers = {nullptr};
output->null_count = batch.length;
}
};

// Specialization for boolean output: Exec has an empty body, so it neither
// writes to `out` nor sets a status on `ctx`.
template <>
struct FromDictionaryCast<BooleanType> {
static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {}
};
// Kernel exec that marks the output array as all-null: a single null buffer
// and a null count equal to the batch length.
void OutputAllNull(KernelContext* ctx, const ExecBatch& batch, Datum* out);

template <typename T>
struct FromNullCast {
Expand Down Expand Up @@ -258,11 +91,15 @@ struct MaybeAddFromDictionary {
// Specialization that registers a dictionary -> `out_ty` cast kernel on `func`
// for output types T passing the enable_if condition below.
// NOTE(review): this span carries two variants of the enable_if condition
// (with and without the is_null_type exclusion) and two variants of the
// AddKernel call (FromDictionaryCast<T>::Exec vs. UnpackDictionary) -- confirm
// which one is meant to remain.
template <typename T>
struct MaybeAddFromDictionary<
T, enable_if_t<!is_boolean_type<T>::value && !is_nested_type<T>::value &&
!std::is_same<DictionaryType, T>::value>> {
!is_null_type<T>::value && !std::is_same<DictionaryType, T>::value>> {
static void Add(const OutputType& out_ty, CastFunction* func) {
// Dictionary unpacking not implemented for boolean or nested types
DCHECK_OK(func->AddKernel(Type::DICTIONARY, {InputType::Array(Type::DICTIONARY)},
out_ty, FromDictionaryCast<T>::Exec));
// Dictionary unpacking not implemented for boolean or nested types.
//
// XXX: Uses Take and does its own memory allocation for the moment. We can
// fix this later.
DCHECK_OK(func->AddKernel(
Type::DICTIONARY, {InputType::Array(Type::DICTIONARY)}, out_ty, UnpackDictionary,
NullHandling::COMPUTED_NO_PREALLOCATE, MemAllocation::NO_PREALLOCATE));
}
};

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
// to cast from dict<null> -> null but there are unit tests for it
auto cast_null = std::make_shared<CastFunction>("cast_null", Type::NA);
DCHECK_OK(cast_null->AddKernel(Type::DICTIONARY, {InputType::Array(Type::DICTIONARY)},
null(), FromDictionaryCast<NullType>::Exec));
null(), OutputAllNull));
functions.push_back(cast_null);

functions.push_back(GetCastToInteger<Int8Type>("cast_int8"));
Expand Down