diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index efb45252535..c8081968073 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -783,6 +783,7 @@ ArrayKernelExec GeneratePhysicalInteger(detail::GetTypeId get_id) { case Type::DATE64: case Type::TIMESTAMP: case Type::TIME64: + case Type::DURATION: return Generator::Exec; case Type::UINT8: return Generator::Exec; diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 62fbdb4136a..b51124e4bab 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -410,7 +410,6 @@ using enable_if_can_hash = template struct HashInitFunctor { - using ArrayType = typename TypeTraits::ArrayType; using HashKernelType = typename HashKernelTraits::HashKernelImpl; static std::unique_ptr Init(KernelContext* ctx, @@ -423,19 +422,48 @@ struct HashInitFunctor { }; template -struct HashInitVisitor { - VectorKernel* out; - - Status Visit(const DataType& type) { - return Status::NotImplemented("Hashing not available for ", type.ToString()); +KernelInit GetHashInit(Type::type type_id) { + // ARROW-8933: Generate only a single hash kernel per physical data + // representation + switch (type_id) { + case Type::NA: + return HashInitFunctor::Init; + case Type::BOOL: + return HashInitFunctor::Init; + case Type::INT8: + case Type::UINT8: + return HashInitFunctor::Init; + case Type::INT16: + case Type::UINT16: + return HashInitFunctor::Init; + case Type::INT32: + case Type::UINT32: + case Type::FLOAT: + case Type::DATE32: + case Type::TIME32: + return HashInitFunctor::Init; + case Type::INT64: + case Type::UINT64: + case Type::DOUBLE: + case Type::DATE64: + case Type::TIME64: + case Type::TIMESTAMP: + case Type::DURATION: + return HashInitFunctor::Init; + case Type::BINARY: + case Type::STRING: + return HashInitFunctor::Init; + case Type::LARGE_BINARY: + case Type::LARGE_STRING: + return HashInitFunctor::Init; + case Type::FIXED_SIZE_BINARY: + case Type::DECIMAL: + return HashInitFunctor::Init; + default: + DCHECK(false); + return nullptr; } - - template - enable_if_can_hash Visit(const Type&) { - out->init = HashInitFunctor::Init; - return Status::OK(); - } -}; +} void HashExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { auto hash_impl = checked_cast(ctx->state()); @@ -485,34 +513,28 @@ ValueDescr ValueCountsOutput(KernelContext*, const std::vector& desc {field(kValuesFieldName, descrs[0].type), field(kCountsFieldName, int64())})); } -template -void AddKernel(VectorFunction* func, VectorKernel kernel, - const std::shared_ptr& type) { - HashInitVisitor visitor{&kernel}; - DCHECK_OK(VisitTypeInline(*type, &visitor)); - DCHECK_OK(func->AddKernel(std::move(kernel))); -} - template void AddHashKernels(VectorFunction* func, VectorKernel base, OutputType::Resolver out_resolver) { OutputType out_ty(out_resolver); for (const auto& ty : PrimitiveTypes()) { + base.init = GetHashInit(ty->id()); base.signature = KernelSignature::Make({InputType::Array(ty)}, out_ty); - AddKernel(func, base, ty); + DCHECK_OK(func->AddKernel(base)); } // Example parametric types that we want to match only on Type::type auto parametric_types = {time32(TimeUnit::SECOND), time64(TimeUnit::MICRO), timestamp(TimeUnit::SECOND), fixed_size_binary(0)}; for (const auto& ty : parametric_types) { + base.init = GetHashInit(ty->id()); base.signature = KernelSignature::Make({InputType::Array(ty->id())}, out_ty); - AddKernel(func, base, /*dummy=*/ty); + DCHECK_OK(func->AddKernel(base)); } - // Handle Decimal as a physical string, not a number + base.init = GetHashInit(Type::DECIMAL); base.signature = KernelSignature::Make({InputType::Array(Type::DECIMAL)}, out_ty); - AddKernel(func, base, fixed_size_binary(0)); + DCHECK_OK(func->AddKernel(base)); } } // namespace diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index ffe1e6d7f7e..0e5336a7f9e 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -19,6 +19,7 @@ #include #include +#include "arrow/array/data.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/common.h" #include "arrow/util/optional.h" @@ -37,11 +38,26 @@ struct PartitionIndicesState : public KernelState { int64_t pivot; }; +Status GetPhysicalView(const std::shared_ptr& arr, + const std::shared_ptr& type, + std::shared_ptr* out) { + if (!arr->type->Equals(*type)) { + return ::arrow::internal::GetArrayView(arr, type).Value(out); + } else { + *out = arr; + return Status::OK(); + } +} + template struct PartitionIndices { using ArrayType = typename TypeTraits::ArrayType; static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - ArrayType arr(batch[0].array()); + std::shared_ptr arg0; + KERNEL_RETURN_IF_ERROR( + ctx, + GetPhysicalView(batch[0].array(), TypeTraits::type_singleton(), &arg0)); + ArrayType arr(arg0); int64_t pivot = checked_cast(*ctx->state()).pivot; if (pivot > arr.length()) { @@ -227,7 +243,11 @@ template struct SortIndices { using ArrayType = typename TypeTraits::ArrayType; static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - ArrayType arr(batch[0].array()); + std::shared_ptr arg0; + KERNEL_RETURN_IF_ERROR( + ctx, + GetPhysicalView(batch[0].array(), TypeTraits::type_singleton(), &arg0)); + ArrayType arr(arg0); ArrayData* out_arr = out->mutable_array(); uint64_t* out_begin = out_arr->GetMutableValues(1); uint64_t* out_end = out_begin + arr.length(); @@ -259,7 +279,7 @@ void AddSortingKernels(VectorKernel base, VectorFunction* func) { } for (const auto& ty : BaseBinaryTypes()) { base.signature = KernelSignature::Make({InputType::Array(ty)}, uint64()); - base.exec = GenerateVarBinary(*ty); + base.exec = GenerateVarBinaryBase(*ty); DCHECK_OK(func->AddKernel(base)); } }