From a15573dd78220684c2eeb739467aeccf5bed2806 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 26 May 2020 15:55:23 -0500 Subject: [PATCH 1/3] Implement a couple example string kernels and relevant code generation support --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/compare.cc | 2 +- cpp/src/arrow/compute/kernels/CMakeLists.txt | 3 +- .../arrow/compute/kernels/codegen_internal.cc | 5 + .../arrow/compute/kernels/codegen_internal.h | 116 +++++++++++++++--- .../compute/kernels/scalar_set_lookup.cc | 6 +- .../compute/kernels/scalar_string_ascii.cc | 66 ++++++++++ .../compute/kernels/scalar_string_internal.h | 44 +++++++ .../compute/kernels/scalar_string_test.cc | 73 +++++++++++ cpp/src/arrow/compute/registry.cc | 1 + cpp/src/arrow/compute/registry_internal.h | 1 + cpp/src/arrow/scalar.cc | 6 + cpp/src/arrow/scalar.h | 2 + cpp/src/arrow/testing/gtest_util.cc | 16 +++ cpp/src/arrow/testing/gtest_util.h | 3 + cpp/src/arrow/type_traits.h | 2 + 16 files changed, 323 insertions(+), 24 deletions(-) create mode 100644 cpp/src/arrow/compute/kernels/scalar_string_ascii.cc create mode 100644 cpp/src/arrow/compute/kernels/scalar_string_internal.h create mode 100644 cpp/src/arrow/compute/kernels/scalar_string_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 30c4c737081..031ae5d39c5 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -340,6 +340,7 @@ if(ARROW_COMPUTE) compute/kernels/scalar_cast_temporal.cc compute/kernels/scalar_compare.cc compute/kernels/scalar_set_lookup.cc + compute/kernels/scalar_string_ascii.cc compute/kernels/vector_filter.cc compute/kernels/vector_hash.cc compute/kernels/vector_sort.cc diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 351a42f4918..7c19a6fdc23 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -850,7 +850,7 @@ class ScalarEqualsVisitor { template typename std::enable_if::value, Status>::type Visit(const T& left) { - const auto& right = checked_cast(right_); + const auto& right = checked_cast(right_); result_ = internal::SharedPtrEquals(left.value, right.value); return Status::OK(); } diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index 361e24b7523..74493a85e18 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -24,7 +24,8 @@ add_arrow_compute_test(scalar_test scalar_boolean_test.cc scalar_cast_test.cc scalar_compare_test.cc - scalar_set_lookup_test.cc) + scalar_set_lookup_test.cc + scalar_string_test.cc) add_arrow_benchmark(scalar_compare_benchmark PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index 2771b6a89f8..5db4c92471e 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -102,6 +102,11 @@ const std::vector>& BaseBinaryTypes() { return g_base_binary_types; } +const std::vector>& StringTypes() { + static DataTypeVector types = {utf8(), large_utf8()}; + return types; +} + const std::vector>& SignedIntTypes() { std::call_once(codegen_static_initialized, InitStaticData); return g_signed_int_types; diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index bf504a320fc..f08286abb01 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -122,20 +122,55 @@ struct UnboxScalar> { }; template -struct GetValueType; +struct GetViewType; template -struct GetValueType> { +struct GetViewType> { using T = typename Type::c_type; }; template -struct GetValueType< +struct GetViewType< Type, enable_if_t::value || is_decimal_type::value || is_fixed_size_binary_type::value>> { using T = util::string_view; }; +template +struct GetOutputType; + +template +struct GetOutputType> { + using T = typename Type::c_type; +}; + +template +struct GetOutputType< + Type, enable_if_t::value>> { + using T = std::string; +}; + +template +struct BoxScalar; + +template +struct BoxScalar> { + using T = typename GetOutputType::T; + using ScalarType = typename TypeTraits::ScalarType; + static std::shared_ptr Box(T val, const std::shared_ptr& type) { + return std::make_shared(val, type); + } +}; + +template +struct BoxScalar> { + using T = typename GetOutputType::T; + using ScalarType = typename TypeTraits::ScalarType; + static std::shared_ptr Box(T val, const std::shared_ptr&) { + return std::make_shared(val); + } +}; + // ---------------------------------------------------------------------- // Reusable type resolvers @@ -154,6 +189,7 @@ void BinaryExecFlipped(KernelContext* ctx, ArrayKernelExec exec, // functions const std::vector>& BaseBinaryTypes(); +const std::vector>& StringTypes(); const std::vector>& SignedIntTypes(); const std::vector>& UnsignedIntTypes(); const std::vector>& IntTypes(); @@ -327,10 +363,8 @@ struct OutputAdapter> { // }; template struct ScalarUnary { - using OutScalar = typename TypeTraits::ScalarType; - - using OUT = typename GetValueType::T; - using ARG0 = typename GetValueType::T; + using OUT = typename GetOutputType::T; + using ARG0 = typename GetViewType::T; static void Array(KernelContext* ctx, const ExecBatch& batch, Datum* out) { ArrayIterator arg0(*batch[0].array()); @@ -342,8 +376,9 @@ struct ScalarUnary { static void Scalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { if (batch[0].scalar()->is_valid) { ARG0 arg0 = UnboxScalar::Unbox(batch[0]); - out->value = std::make_shared(Op::template Call(ctx, arg0), - out->type()); + out->value = BoxScalar::Box( + Op::template Call(ctx, arg0), + out->type()); } else { out->value = MakeNullScalar(batch[0].type()); } @@ -363,9 +398,8 @@ struct ScalarUnary { template struct ScalarUnaryNotNullStateful { using ThisType = ScalarUnaryNotNullStateful; - using OutScalar = typename TypeTraits::ScalarType; - using OUT = typename GetValueType::T; - using ARG0 = typename GetValueType::T; + using OUT = typename GetOutputType::T; + using ARG0 = typename GetViewType::T; Op op; ScalarUnaryNotNullStateful(Op op) : op(std::move(op)) {} @@ -394,6 +428,30 @@ struct ScalarUnaryNotNullStateful { } }; + template + struct ArrayExec> { + static void Exec(const ThisType& functor, KernelContext* ctx, const ExecBatch& batch, + Datum* out) { + typename TypeTraits::BuilderType builder; + Status s = VisitArrayDataInline( + *batch[0].array(), [&](util::optional v) -> Status { + if (v.has_value()) { + return builder.Append(functor.op.Call(ctx, *v)); + } else { + return builder.AppendNull(); + } + }); + if (!s.ok()) { + ctx->SetStatus(s); + return; + } else { + std::shared_ptr result; + ctx->SetStatus(builder.FinishInternal(&result)); + out->value = std::move(result); + } + } + }; + template struct ArrayExec::value>> { static void Exec(const ThisType& functor, KernelContext* ctx, const ExecBatch& batch, @@ -416,7 +474,7 @@ struct ScalarUnaryNotNullStateful { void Scalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { if (batch[0].scalar()->is_valid) { ARG0 arg0 = UnboxScalar::Unbox(batch[0]); - out->value = std::make_shared( + out->value = BoxScalar::Box( this->op.template Call(ctx, arg0), out->type()); } else { @@ -438,6 +496,9 @@ struct ScalarUnaryNotNullStateful { // operator requires some initialization use ScalarUnaryNotNullStateful template struct ScalarUnaryNotNull { + using OUT = typename GetOutputType::T; + using ARG0 = typename GetViewType::T; + static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { // Seed kernel with dummy state ScalarUnaryNotNullStateful kernel({}); @@ -464,11 +525,9 @@ struct ScalarUnaryNotNull { template struct ScalarBinary { - using OutScalarType = typename TypeTraits::ScalarType; - - using OUT = typename GetValueType::T; - using ARG0 = typename GetValueType::T; - using ARG1 = typename GetValueType::T; + using OUT = typename GetOutputType::T; + using ARG0 = typename GetViewType::T; + using ARG1 = typename GetViewType::T; template static void ArrayArray(KernelContext* ctx, const ExecBatch& batch, Datum* out) { @@ -492,7 +551,8 @@ struct ScalarBinary { static void ScalarScalar(KernelContext* ctx, const ExecBatch& batch, Datum* out) { auto arg0 = UnboxScalar::Unbox(batch[0]); auto arg1 = UnboxScalar::Unbox(batch[1]); - out->value = std::make_shared(ChosenOp::template Call(ctx, arg0, arg1)); + out->value = BoxScalar::Box(ChosenOp::template Call(ctx, arg0, arg1), + out->type()); } static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { @@ -746,6 +806,24 @@ ArrayKernelExec BaseBinary(detail::GetTypeId get_id) { } } +// Generate a kernel given a templated functor for string types +// +// See "Numeric" above for description of the generator functor +template