From 0e98691224243a459ae28e9369c564664851919d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 23 Jun 2020 17:48:48 -0500 Subject: [PATCH 1/4] Only generate one hash kernel per physical data layout --- cpp/src/arrow/compute/kernels/vector_hash.cc | 71 +++++++++++++------- 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index 62fbdb4136a..a149ba54d7d 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -410,7 +410,6 @@ using enable_if_can_hash = template struct HashInitFunctor { - using ArrayType = typename TypeTraits::ArrayType; using HashKernelType = typename HashKernelTraits::HashKernelImpl; static std::unique_ptr Init(KernelContext* ctx, @@ -423,19 +422,48 @@ struct HashInitFunctor { }; template -struct HashInitVisitor { - VectorKernel* out; - - Status Visit(const DataType& type) { - return Status::NotImplemented("Hashing not available for ", type.ToString()); +KernelInit GetHashInit(Type::type type_id) { + // ARROW-8933: Generate only a single hash kernel per physical data + // representation + switch (type_id) { + case Type::NA: + return HashInitFunctor::Init; + case Type::BOOL: + return HashInitFunctor::Init; + case Type::INT8: + case Type::UINT8: + return HashInitFunctor::Init; + case Type::INT16: + case Type::UINT16: + return HashInitFunctor::Init; + case Type::INT32: + case Type::UINT32: + case Type::FLOAT: + case Type::DATE32: + case Type::TIME32: + return HashInitFunctor::Init; + case Type::INT64: + case Type::UINT64: + case Type::DOUBLE: + case Type::DATE64: + case Type::TIME64: + case Type::TIMESTAMP: + case Type::DURATION: + return HashInitFunctor::Init; + case Type::BINARY: + case Type::STRING: + return HashInitFunctor::Init; + case Type::LARGE_BINARY: + case Type::LARGE_STRING: + return HashInitFunctor::Init; + case Type::FIXED_SIZE_BINARY: + case Type::DECIMAL: + return HashInitFunctor::Init; + default: + DCHECK(false); + return nullptr; } - - template - enable_if_can_hash Visit(const Type&) { - out->init = HashInitFunctor::Init; - return Status::OK(); - } -}; +} void HashExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { auto hash_impl = checked_cast(ctx->state()); @@ -485,34 +513,29 @@ ValueDescr ValueCountsOutput(KernelContext*, const std::vector& desc {field(kValuesFieldName, descrs[0].type), field(kCountsFieldName, int64())})); } -template -void AddKernel(VectorFunction* func, VectorKernel kernel, - const std::shared_ptr& type) { - HashInitVisitor visitor{&kernel}; - DCHECK_OK(VisitTypeInline(*type, &visitor)); - DCHECK_OK(func->AddKernel(std::move(kernel))); -} - template void AddHashKernels(VectorFunction* func, VectorKernel base, OutputType::Resolver out_resolver) { OutputType out_ty(out_resolver); for (const auto& ty : PrimitiveTypes()) { + base.init = GetHashInit(ty->id()); base.signature = KernelSignature::Make({InputType::Array(ty)}, out_ty); - AddKernel(func, base, ty); + DCHECK_OK(func->AddKernel(base)); } // Example parametric types that we want to match only on Type::type auto parametric_types = {time32(TimeUnit::SECOND), time64(TimeUnit::MICRO), timestamp(TimeUnit::SECOND), fixed_size_binary(0)}; for (const auto& ty : parametric_types) { + base.init = GetHashInit(ty->id()); base.signature = KernelSignature::Make({InputType::Array(ty->id())}, out_ty); - AddKernel(func, base, /*dummy=*/ty); + DCHECK_OK(func->AddKernel(base)); } // Handle Decimal as a physical string, not a number + base.init = GetHashInit(Type::DECIMAL); base.signature = KernelSignature::Make({InputType::Array(Type::DECIMAL)}, out_ty); - AddKernel(func, base, fixed_size_binary(0)); + DCHECK_OK(func->AddKernel(base)); } } // namespace From a8725493660b3d15349462dec4d0c5584714026d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 23 Jun 2020 17:52:54 -0500 Subject: [PATCH 2/4] Remove outdated comment --- cpp/src/arrow/compute/kernels/vector_hash.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index a149ba54d7d..b51124e4bab 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -532,7 +532,6 @@ void AddHashKernels(VectorFunction* func, VectorKernel base, DCHECK_OK(func->AddKernel(base)); } - // Handle Decimal as a physical string, not a number base.init = GetHashInit(Type::DECIMAL); base.signature = KernelSignature::Make({InputType::Array(Type::DECIMAL)}, out_ty); DCHECK_OK(func->AddKernel(base)); From 3522e589f3c5f4e77fc3d043693c06d14013e4b1 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 23 Jun 2020 18:00:55 -0500 Subject: [PATCH 3/4] Trim sort kernels a bit too --- cpp/src/arrow/compute/kernels/codegen_internal.h | 1 + cpp/src/arrow/compute/kernels/vector_sort.cc | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index efb45252535..c8081968073 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -783,6 +783,7 @@ ArrayKernelExec GeneratePhysicalInteger(detail::GetTypeId get_id) { case Type::DATE64: case Type::TIMESTAMP: case Type::TIME64: + case Type::DURATION: return Generator::Exec; case Type::UINT8: return Generator::Exec; diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index ffe1e6d7f7e..9aa91c6a76f 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -254,12 +254,12 @@ template