From 66c08249589789fa5bbdff66b05c596c6b5bb506 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 14 Jul 2020 11:13:04 +0200 Subject: [PATCH 1/3] ARROW-9390: [C++][Doc] Review compute function names Modified function names: * minmax -> min_max * binary_isascii -> string_isascii (only works on string types) * ascii_length -> binary_length (also make it work on binary types) * binary_contains_exact -> match_substring (other possibility: has_substring ?) * match -> index_in * isin -> is_in * list_value_lengths -> list_value_length * partition_indices -> partition_nth_indices (other kinds of partitioning would be possible, e.g. using a predicate) Document string predicate functions (ARROW-9444). Also fix the allocation of IsValid output buffer in certain cases. --- cpp/src/arrow/compute/api_aggregate.cc | 2 +- cpp/src/arrow/compute/api_scalar.cc | 6 +- cpp/src/arrow/compute/api_scalar.h | 17 +- cpp/src/arrow/compute/api_vector.cc | 6 +- cpp/src/arrow/compute/api_vector.h | 4 +- .../arrow/compute/kernels/aggregate_basic.cc | 2 +- .../arrow/compute/kernels/aggregate_test.cc | 4 +- .../arrow/compute/kernels/scalar_nested.cc | 16 +- .../compute/kernels/scalar_nested_test.cc | 4 +- .../compute/kernels/scalar_set_lookup.cc | 47 ++--- .../compute/kernels/scalar_set_lookup_test.cc | 22 +-- .../arrow/compute/kernels/scalar_string.cc | 67 +++---- .../kernels/scalar_string_benchmark.cc | 8 +- .../compute/kernels/scalar_string_test.cc | 88 ++++----- .../arrow/compute/kernels/scalar_validity.cc | 36 ++-- cpp/src/arrow/compute/kernels/test_util.cc | 5 +- .../compute/kernels/vector_nested_test.cc | 12 +- .../compute/kernels/vector_selection_test.cc | 1 + cpp/src/arrow/compute/kernels/vector_sort.cc | 18 +- .../arrow/compute/kernels/vector_sort_test.cc | 3 +- cpp/src/arrow/dataset/partition.cc | 2 +- cpp/src/arrow/type_traits.h | 2 + docs/source/cpp/compute.rst | 168 +++++++++++++----- python/pyarrow/_compute.pyx | 13 +- python/pyarrow/array.pxi | 2 +- python/pyarrow/compute.py | 24 ++- python/pyarrow/includes/libarrow.pxd | 6 +- python/pyarrow/tests/test_compute.py | 24 ++- r/R/compute.R | 4 +- r/src/compute.cpp | 2 +- 30 files changed, 374 insertions(+), 241 deletions(-) diff --git a/cpp/src/arrow/compute/api_aggregate.cc b/cpp/src/arrow/compute/api_aggregate.cc index 358a5ad0cee..95e859e20ee 100644 --- a/cpp/src/arrow/compute/api_aggregate.cc +++ b/cpp/src/arrow/compute/api_aggregate.cc @@ -38,7 +38,7 @@ Result Sum(const Datum& value, ExecContext* ctx) { } Result MinMax(const Datum& value, const MinMaxOptions& options, ExecContext* ctx) { - return CallFunction("minmax", {value}, &options, ctx); + return CallFunction("min_max", {value}, &options, ctx); } } // namespace compute diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index 77893f74fcd..9a911030999 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -73,12 +73,12 @@ static Result ExecSetLookup(const std::string& func_name, const Datum& da } Result IsIn(const Datum& values, const Datum& value_set, ExecContext* ctx) { - return ExecSetLookup("isin", values, value_set, + return ExecSetLookup("is_in", values, value_set, /*add_nulls_to_hash_table=*/false, ctx); } -Result Match(const Datum& values, const Datum& value_set, ExecContext* ctx) { - return ExecSetLookup("match", values, value_set, +Result IndexIn(const Datum& values, const Datum& value_set, ExecContext* ctx) { + return ExecSetLookup("index_in", values, value_set, /*add_nulls_to_hash_table=*/true, ctx); } diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 1d8ef091815..80e3ebb98b3 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -42,15 +42,14 @@ struct ArithmeticOptions : public FunctionOptions { bool check_overflow; }; -struct ARROW_EXPORT BinaryContainsExactOptions : public FunctionOptions { - explicit BinaryContainsExactOptions(std::string pattern) - : pattern(std::move(pattern)) {} +struct ARROW_EXPORT MatchSubstringOptions : public FunctionOptions { + explicit MatchSubstringOptions(std::string pattern) : pattern(std::move(pattern)) {} - /// The exact pattern to look for inside input values. + /// The exact substring to look for inside input values. std::string pattern; }; -/// Options for IsIn and Match functions +/// Options for IsIn and IndexIn functions struct ARROW_EXPORT SetLookupOptions : public FunctionOptions { explicit SetLookupOptions(Datum value_set, bool skip_nulls) : value_set(std::move(value_set)), skip_nulls(skip_nulls) {} @@ -60,7 +59,7 @@ struct ARROW_EXPORT SetLookupOptions : public FunctionOptions { /// Whether nulls in `value_set` count for lookup. /// /// If true, any null in `value_set` is ignored and nulls in the input - /// produce null (Match) or false (IsIn) values in the output. + /// produce null (IndexIn) or false (IsIn) values in the output. /// If false, any null in `value_set` is successfully matched in /// the input. bool skip_nulls; @@ -238,7 +237,7 @@ ARROW_EXPORT Result IsIn(const Datum& values, const Datum& value_set, ExecContext* ctx = NULLPTR); -/// \brief Match examines each slot in the values against a value_set array. +/// \brief IndexIn examines each slot in the values against a value_set array. /// If the value is not found in value_set, null will be output. /// If found, the index of occurrence within value_set (ignoring duplicates) /// will be output. @@ -259,8 +258,8 @@ Result IsIn(const Datum& values, const Datum& value_set, /// \since 1.0.0 /// \note API not yet finalized ARROW_EXPORT -Result Match(const Datum& values, const Datum& value_set, - ExecContext* ctx = NULLPTR); +Result IndexIn(const Datum& values, const Datum& value_set, + ExecContext* ctx = NULLPTR); /// \brief IsValid returns true for each element of `values` that is not null, /// false otherwise diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index dd9c43910f1..9a36714d107 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -35,9 +35,9 @@ namespace compute { Result> NthToIndices(const Array& values, int64_t n, ExecContext* ctx) { - PartitionOptions options(/*pivot=*/n); - ARROW_ASSIGN_OR_RAISE( - Datum result, CallFunction("partition_indices", {Datum(values)}, &options, ctx)); + PartitionNthOptions options(/*pivot=*/n); + ARROW_ASSIGN_OR_RAISE(Datum result, CallFunction("partition_nth_indices", + {Datum(values)}, &options, ctx)); return result.make_array(); } diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index c3e9dc9d2fc..3aa3434c098 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -59,8 +59,8 @@ struct ARROW_EXPORT TakeOptions : public FunctionOptions { }; /// \brief Partitioning options for NthToIndices -struct PartitionOptions : public FunctionOptions { - explicit PartitionOptions(int64_t pivot) : pivot(pivot) {} +struct PartitionNthOptions : public FunctionOptions { + explicit PartitionNthOptions(int64_t pivot) : pivot(pivot) {} /// The index into the equivalent sorted array of the partition pivot element. int64_t pivot; diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 8765914ac73..2349360ba82 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -398,7 +398,7 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); static auto default_minmax_options = MinMaxOptions::Defaults(); - func = std::make_shared("minmax", Arity::Unary(), + func = std::make_shared("min_max", Arity::Unary(), &default_minmax_options); aggregate::AddMinMaxKernels(aggregate::MinMaxInit, {boolean()}, func.get()); aggregate::AddMinMaxKernels(aggregate::MinMaxInit, NumericTypes(), func.get()); diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index db548f27b36..6658a7e9234 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -583,11 +583,11 @@ TYPED_TEST(TestFloatingMinMaxKernel, Floats) { TYPED_TEST(TestFloatingMinMaxKernel, DefaultOptions) { auto values = ArrayFromJSON(this->type_singleton(), "[0, 1, 2, 3, 4]"); - ASSERT_OK_AND_ASSIGN(auto no_options_provided, CallFunction("minmax", {values})); + ASSERT_OK_AND_ASSIGN(auto no_options_provided, CallFunction("min_max", {values})); auto default_options = MinMaxOptions::Defaults(); ASSERT_OK_AND_ASSIGN(auto explicit_defaults, - CallFunction("minmax", {values}, &default_options)); + CallFunction("min_max", {values}, &default_options)); AssertDatumsEqual(explicit_defaults, no_options_provided); } diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 7c61aa3b476..677c577364e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -28,7 +28,7 @@ namespace internal { namespace { template -void ListValueLengths(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +void ListValueLength(KernelContext* ctx, const ExecBatch& batch, Datum* out) { using ScalarType = typename TypeTraits::ScalarType; using OffsetScalarType = typename TypeTraits::OffsetScalarType; @@ -55,13 +55,13 @@ void ListValueLengths(KernelContext* ctx, const ExecBatch& batch, Datum* out) { } // namespace void RegisterScalarNested(FunctionRegistry* registry) { - auto list_value_lengths = - std::make_shared("list_value_lengths", Arity::Unary()); - DCHECK_OK(list_value_lengths->AddKernel({InputType(Type::LIST)}, int32(), - ListValueLengths)); - DCHECK_OK(list_value_lengths->AddKernel({InputType(Type::LARGE_LIST)}, int64(), - ListValueLengths)); - DCHECK_OK(registry->AddFunction(std::move(list_value_lengths))); + auto list_value_length = + std::make_shared("list_value_length", Arity::Unary()); + DCHECK_OK(list_value_length->AddKernel({InputType(Type::LIST)}, int32(), + ListValueLength)); + DCHECK_OK(list_value_length->AddKernel({InputType(Type::LARGE_LIST)}, int64(), + ListValueLength)); + DCHECK_OK(registry->AddFunction(std::move(list_value_length))); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index 4657c41a407..24776913ee0 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -29,9 +29,9 @@ static std::shared_ptr GetOffsetType(const DataType& type) { return type.id() == Type::LIST ? int32() : int64(); } -TEST(TestScalarNested, ListValueLengths) { +TEST(TestScalarNested, ListValueLength) { for (auto ty : {list(int32()), large_list(int32())}) { - CheckScalarUnary("list_value_lengths", ty, "[[0, null, 1], null, [2, 3], []]", + CheckScalarUnary("list_value_length", ty, "[[0, null, 1], null, [2, 3], []]", GetOffsetType(*ty), "[3, null, 2, 0]"); } } diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc index 67e6204a8de..726b01fb477 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc @@ -165,13 +165,13 @@ std::unique_ptr InitSetLookup(KernelContext* ctx, return result; } -struct MatchVisitor { +struct IndexInVisitor { KernelContext* ctx; const ArrayData& data; Datum* out; Int32Builder builder; - MatchVisitor(KernelContext* ctx, const ArrayData& data, Datum* out) + IndexInVisitor(KernelContext* ctx, const ArrayData& data, Datum* out) : ctx(ctx), data(data), out(out), builder(ctx->exec_context()->memory_pool()) {} Status Visit(const DataType&) { @@ -190,7 +190,7 @@ struct MatchVisitor { } template - Status ProcessMatch() { + Status ProcessIndexIn() { using T = typename GetViewType::T; const auto& state = checked_cast&>(*ctx->state()); @@ -223,23 +223,24 @@ struct MatchVisitor { template enable_if_boolean Visit(const Type&) { - return ProcessMatch(); + return ProcessIndexIn(); } template enable_if_t::value && !is_boolean_type::value, Status> Visit( const Type&) { - return ProcessMatch::Type>(); + return ProcessIndexIn< + typename UnsignedIntType::Type>(); } template enable_if_base_binary Visit(const Type&) { - return ProcessMatch(); + return ProcessIndexIn(); } // Handle Decimal128Type, FixedSizeBinaryType Status Visit(const FixedSizeBinaryType& type) { - return ProcessMatch(); + return ProcessIndexIn(); } Status Execute() { @@ -269,9 +270,9 @@ void ExecArrayOrScalar(KernelContext* ctx, const Datum& in, Datum* out, *out = std::move(out_scalar); } -void ExecMatch(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +void ExecIndexIn(KernelContext* ctx, const ExecBatch& batch, Datum* out) { ExecArrayOrScalar(ctx, batch[0], out, [&](const ArrayData& in) { - return MatchVisitor(ctx, in, out).Execute(); + return IndexInVisitor(ctx, in, out).Execute(); }); } @@ -395,10 +396,10 @@ void AddBasicSetLookupKernels(ScalarKernel kernel, } } -// Enables calling isin with CallFunction as though it were binary. +// Enables calling is_in with CallFunction as though it were binary. class IsInMetaBinary : public MetaFunction { public: - IsInMetaBinary() : MetaFunction("isin_meta_binary", Arity::Binary()) {} + IsInMetaBinary() : MetaFunction("is_in_meta_binary", Arity::Binary()) {} Result ExecuteImpl(const std::vector& args, const FunctionOptions* options, @@ -408,16 +409,16 @@ class IsInMetaBinary : public MetaFunction { } }; -// Enables calling match with CallFunction as though it were binary. -class MatchMetaBinary : public MetaFunction { +// Enables calling index_in with CallFunction as though it were binary. +class IndexInMetaBinary : public MetaFunction { public: - MatchMetaBinary() : MetaFunction("match_meta_binary", Arity::Binary()) {} + IndexInMetaBinary() : MetaFunction("index_in_meta_binary", Arity::Binary()) {} Result ExecuteImpl(const std::vector& args, const FunctionOptions* options, ExecContext* ctx) const override { DCHECK_EQ(options, nullptr); - return Match(args[0], args[1], ctx); + return IndexIn(args[0], args[1], ctx); } }; @@ -429,33 +430,33 @@ void RegisterScalarSetLookup(FunctionRegistry* registry) { ScalarKernel isin_base; isin_base.init = InitSetLookup; isin_base.exec = ExecIsIn; - auto isin = std::make_shared("isin", Arity::Unary()); + auto is_in = std::make_shared("is_in", Arity::Unary()); - AddBasicSetLookupKernels(isin_base, /*output_type=*/boolean(), isin.get()); + AddBasicSetLookupKernels(isin_base, /*output_type=*/boolean(), is_in.get()); isin_base.signature = KernelSignature::Make({null()}, boolean()); isin_base.null_handling = NullHandling::COMPUTED_PREALLOCATE; - DCHECK_OK(isin->AddKernel(isin_base)); - DCHECK_OK(registry->AddFunction(isin)); + DCHECK_OK(is_in->AddKernel(isin_base)); + DCHECK_OK(registry->AddFunction(is_in)); DCHECK_OK(registry->AddFunction(std::make_shared())); } - // Match uses Int32Builder and so is responsible for all its own allocation + // IndexIn uses Int32Builder and so is responsible for all its own allocation { ScalarKernel match_base; match_base.init = InitSetLookup; - match_base.exec = ExecMatch; + match_base.exec = ExecIndexIn; match_base.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; match_base.mem_allocation = MemAllocation::NO_PREALLOCATE; - auto match = std::make_shared("match", Arity::Unary()); + auto match = std::make_shared("index_in", Arity::Unary()); AddBasicSetLookupKernels(match_base, /*output_type=*/int32(), match.get()); match_base.signature = KernelSignature::Make({null()}, int32()); DCHECK_OK(match->AddKernel(match_base)); DCHECK_OK(registry->AddFunction(match)); - DCHECK_OK(registry->AddFunction(std::make_shared())); + DCHECK_OK(registry->AddFunction(std::make_shared())); } } diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc index a20fe61d1bc..e1540d15b1f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc @@ -73,9 +73,9 @@ class TestIsInKernel : public ::testing::Test {}; TEST_F(TestIsInKernel, CallBinary) { auto haystack = ArrayFromJSON(int8(), "[0, 1, 2, 3, 4, 5, 6, 7, 8]"); auto needles = ArrayFromJSON(int8(), "[2, 3, 5, 7]"); - ASSERT_RAISES(Invalid, CallFunction("isin", {haystack, needles})); + ASSERT_RAISES(Invalid, CallFunction("is_in", {haystack, needles})); - ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("isin_meta_binary", {haystack, needles})); + ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("is_in_meta_binary", {haystack, needles})); auto expected = ArrayFromJSON(boolean(), ("[false, false, true, true, false," "true, false, true, false]")); AssertArraysEqual(*expected, *out.make_array()); @@ -118,6 +118,7 @@ TYPED_TEST(TestIsInKernelPrimitive, IsIn) { CheckIsIn( type, {2, 1, 2, 3}, {false, false, false, false}, {2, 1, 2, 1, 2, 3, 3}, {false, false, false, false, false, false, false}, {true, true, true, true}, {}); + // No Match CheckIsIn( type, {2, 1, 7, 3, 8}, {true, false, true, true, true}, {2, 1, 2, 1, 6, 3, 3}, @@ -411,7 +412,7 @@ TEST_F(TestIsInKernel, IsInChunkedArrayInvoke) { AssertChunkedEquivalent(*expected_carr, *encoded_out.chunked_array()); } // ---------------------------------------------------------------------- -// Match tests +// IndexIn tests class TestMatchKernel : public ::testing::Test { public: @@ -421,7 +422,7 @@ class TestMatchKernel : public ::testing::Test { std::shared_ptr needles = ArrayFromJSON(type, needles_json); std::shared_ptr expected = ArrayFromJSON(int32(), expected_json); - ASSERT_OK_AND_ASSIGN(Datum actual_datum, Match(haystack, needles)); + ASSERT_OK_AND_ASSIGN(Datum actual_datum, IndexIn(haystack, needles)); std::shared_ptr actual = actual_datum.make_array(); AssertArraysEqual(*expected, *actual, /*verbose=*/true); } @@ -430,9 +431,10 @@ class TestMatchKernel : public ::testing::Test { TEST_F(TestMatchKernel, CallBinary) { auto haystack = ArrayFromJSON(int8(), "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"); auto needles = ArrayFromJSON(int8(), "[2, 3, 5, 7]"); - ASSERT_RAISES(Invalid, CallFunction("match", {haystack, needles})); + ASSERT_RAISES(Invalid, CallFunction("index_in", {haystack, needles})); - ASSERT_OK_AND_ASSIGN(Datum out, CallFunction("match_meta_binary", {haystack, needles})); + ASSERT_OK_AND_ASSIGN(Datum out, + CallFunction("index_in_meta_binary", {haystack, needles})); auto expected = ArrayFromJSON(int32(), ("[null, null, 0, 1, null, 2, null, 3, null," " null, null]")); AssertArraysEqual(*expected, *out.make_array()); @@ -448,7 +450,7 @@ using PrimitiveDictionaries = TYPED_TEST_SUITE(TestMatchKernelPrimitive, PrimitiveDictionaries); -TYPED_TEST(TestMatchKernelPrimitive, Match) { +TYPED_TEST(TestMatchKernelPrimitive, IndexIn) { auto type = TypeTraits::type_singleton(); // No Nulls @@ -508,7 +510,7 @@ TYPED_TEST(TestMatchKernelPrimitive, PrimitiveResizeTable) { needles = haystack; ASSERT_OK(expected_builder.Finish(&expected)); - ASSERT_OK_AND_ASSIGN(Datum actual_datum, Match(haystack, needles)); + ASSERT_OK_AND_ASSIGN(Datum actual_datum, IndexIn(haystack, needles)); std::shared_ptr actual = actual_datum.make_array(); ASSERT_ARRAYS_EQUAL(*expected, *actual); } @@ -667,7 +669,7 @@ TEST_F(TestMatchKernel, BinaryResizeTable) { needles = haystack; ASSERT_OK(expected_builder.Finish(&expected)); - ASSERT_OK_AND_ASSIGN(Datum actual_datum, Match(haystack, needles)); + ASSERT_OK_AND_ASSIGN(Datum actual_datum, IndexIn(haystack, needles)); std::shared_ptr actual = actual_datum.make_array(); ASSERT_ARRAYS_EQUAL(*expected, *actual); } @@ -749,7 +751,7 @@ TEST_F(TestMatchKernel, MatchChunkedArrayInvoke) { ArrayVector expected = {i1, i2}; auto expected_carr = std::make_shared(expected); - ASSERT_OK_AND_ASSIGN(Datum encoded_out, Match(carr, Datum(member_set))); + ASSERT_OK_AND_ASSIGN(Datum encoded_out, IndexIn(carr, Datum(member_set))); ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); AssertChunkedEquivalent(*expected_carr, *encoded_out.chunked_array()); diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index fbacaab54d6..31517930c06 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -55,9 +55,7 @@ static inline bool IsAsciiCharacter(T character) { return character < 128; } -// TODO: optional ascii validation - -struct AsciiLength { +struct BinaryLength { template static OUT Call(KernelContext*, ARG0 val) { return static_cast(val.size()); @@ -297,17 +295,6 @@ struct AsciiLower { } }; -void AddAsciiLength(FunctionRegistry* registry) { - auto func = std::make_shared("ascii_length", Arity::Unary()); - ArrayKernelExec exec_offset_32 = - applicator::ScalarUnaryNotNull::Exec; - ArrayKernelExec exec_offset_64 = - applicator::ScalarUnaryNotNull::Exec; - DCHECK_OK(func->AddKernel({utf8()}, int32(), exec_offset_32)); - DCHECK_OK(func->AddKernel({large_utf8()}, int64(), exec_offset_64)); - DCHECK_OK(registry->AddFunction(std::move(func))); -} - // ---------------------------------------------------------------------- // exact pattern detection @@ -344,10 +331,9 @@ void StringBoolTransform(KernelContext* ctx, const ExecBatch& batch, } template -void TransformBinaryContainsExact(const uint8_t* pattern, int64_t pattern_length, - const offset_type* offsets, const uint8_t* data, - int64_t length, int64_t output_offset, - uint8_t* output) { +void TransformMatchSubstring(const uint8_t* pattern, int64_t pattern_length, + const offset_type* offsets, const uint8_t* data, + int64_t length, int64_t output_offset, uint8_t* output) { // This is an implementation of the Knuth-Morris-Pratt algorithm // Phase 1: Build the prefix table @@ -385,20 +371,20 @@ void TransformBinaryContainsExact(const uint8_t* pattern, int64_t pattern_length bitmap_writer.Finish(); } -using BinaryContainsExactState = OptionsWrapper; +using MatchSubstringState = OptionsWrapper; template -struct BinaryContainsExact { +struct MatchSubstring { using offset_type = typename Type::offset_type; static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - BinaryContainsExactOptions arg = BinaryContainsExactState::Get(ctx); + MatchSubstringOptions arg = MatchSubstringState::Get(ctx); const uint8_t* pat = reinterpret_cast(arg.pattern.c_str()); const int64_t pat_size = arg.pattern.length(); StringBoolTransform( ctx, batch, [pat, pat_size](const void* offsets, const uint8_t* data, int64_t length, int64_t output_offset, uint8_t* output) { - TransformBinaryContainsExact( + TransformMatchSubstring( pat, pat_size, reinterpret_cast(offsets), data, length, output_offset, output); }, @@ -406,14 +392,13 @@ struct BinaryContainsExact { } }; -void AddBinaryContainsExact(FunctionRegistry* registry) { - auto func = std::make_shared("binary_contains_exact", Arity::Unary()); - auto exec_32 = BinaryContainsExact::Exec; - auto exec_64 = BinaryContainsExact::Exec; +void AddMatchSubstring(FunctionRegistry* registry) { + auto func = std::make_shared("match_substring", Arity::Unary()); + auto exec_32 = MatchSubstring::Exec; + auto exec_64 = MatchSubstring::Exec; + DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init)); DCHECK_OK( - func->AddKernel({utf8()}, boolean(), exec_32, BinaryContainsExactState::Init)); - DCHECK_OK(func->AddKernel({large_utf8()}, boolean(), exec_64, - BinaryContainsExactState::Init)); + func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init)); DCHECK_OK(registry->AddFunction(std::move(func))); } @@ -870,6 +855,21 @@ void AddStrptime(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); } +void AddBinaryLength(FunctionRegistry* registry) { + auto func = std::make_shared("binary_length", Arity::Unary()); + ArrayKernelExec exec_offset_32 = + applicator::ScalarUnaryNotNull::Exec; + ArrayKernelExec exec_offset_64 = + applicator::ScalarUnaryNotNull::Exec; + for (const auto input_type : {binary(), utf8()}) { + DCHECK_OK(func->AddKernel({input_type}, int32(), exec_offset_32)); + } + for (const auto input_type : {large_binary(), large_utf8()}) { + DCHECK_OK(func->AddKernel({input_type}, int64(), exec_offset_64)); + } + DCHECK_OK(registry->AddFunction(std::move(func))); +} + template