From 4702bc54327209bba67889ac1f2e9994bb940808 Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 3 Jun 2021 16:47:03 -0500 Subject: [PATCH 1/6] ARROW-12949: [C++] Add starts_with/ends_with kernels --- .../arrow/compute/kernels/scalar_string.cc | 141 +++++++++++++++--- .../compute/kernels/scalar_string_test.cc | 59 ++++++++ 2 files changed, 179 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index df3a3991fcf..e62572487af 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -492,6 +492,42 @@ struct PlainSubstringMatcher { bool Match(util::string_view current) const { return Find(current) >= 0; } }; +struct PlainStartsWithMatcher { + const MatchSubstringOptions& options_; + + explicit PlainStartsWithMatcher(const MatchSubstringOptions& options) + : options_(options) {} + + static Result> Make( + const MatchSubstringOptions& options) { + // Should be handled by partial template specialization below + DCHECK(!options.ignore_case); + return ::arrow::internal::make_unique(options); + } + + bool Match(util::string_view current) const { + return current.starts_with(options_.pattern); + } +}; + +struct PlainEndsWithMatcher { + const MatchSubstringOptions& options_; + + explicit PlainEndsWithMatcher(const MatchSubstringOptions& options) + : options_(options) {} + + static Result> Make( + const MatchSubstringOptions& options) { + // Should be handled by partial template specialization below + DCHECK(!options.ignore_case); + return ::arrow::internal::make_unique(options); + } + + bool Match(util::string_view current) const { + return current.ends_with(options_.pattern); + } +}; + #ifdef ARROW_WITH_RE2 struct RegexSubstringMatcher { const MatchSubstringOptions& options_; @@ -581,6 +617,48 @@ struct MatchSubstring { } }; +template +struct MatchSubstring { + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* 
out) { + auto options = MatchSubstringState::Get(ctx); + if (options.ignore_case) { +#ifdef ARROW_WITH_RE2 + MatchSubstringOptions converted_options = options; + converted_options.pattern = "^" + RE2::QuoteMeta(options.pattern); + ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options)); + return MatchSubstringImpl::Exec(ctx, batch, out, + matcher.get()); +#else + return Status::NotImplemented("ignore_case requires RE2"); +#endif + } + ARROW_ASSIGN_OR_RAISE(auto matcher, PlainStartsWithMatcher::Make(options)); + return MatchSubstringImpl::Exec(ctx, batch, out, + matcher.get()); + } +}; + +template +struct MatchSubstring { + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + auto options = MatchSubstringState::Get(ctx); + if (options.ignore_case) { +#ifdef ARROW_WITH_RE2 + MatchSubstringOptions converted_options = options; + converted_options.pattern = RE2::QuoteMeta(options.pattern) + "$"; + ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options)); + return MatchSubstringImpl::Exec(ctx, batch, out, + matcher.get()); +#else + return Status::NotImplemented("ignore_case requires RE2"); +#endif + } + ARROW_ASSIGN_OR_RAISE(auto matcher, PlainEndsWithMatcher::Make(options)); + return MatchSubstringImpl::Exec(ctx, batch, out, + matcher.get()); + } +}; + const FunctionDoc match_substring_doc( "Match strings against literal pattern", ("For each string in `strings`, emit true iff it contains a given pattern.\n" @@ -588,6 +666,20 @@ const FunctionDoc match_substring_doc( "If ignore_case is set, only simple case folding is performed."), {"strings"}, "MatchSubstringOptions"); +const FunctionDoc starts_with_doc( + "Check if strings start with a pattern", + ("For each string in `strings`, emit true iff it starts with a given pattern.\n" + "Null inputs emit null. The pattern must be given in MatchSubstringOptions. 
" + "If ignore_case is set, only simple case folding is performed."), + {"strings"}, "MatchSubstringOptions"); + +const FunctionDoc ends_with_doc( + "Check if strings end with a pattern", + ("For each string in `strings`, emit true iff it ends with a given pattern.\n" + "Null inputs emit null. The pattern must be given in MatchSubstringOptions. " + "If ignore_case is set, only simple case folding is performed."), + {"strings"}, "MatchSubstringOptions"); + #ifdef ARROW_WITH_RE2 const FunctionDoc match_substring_regex_doc( "Match strings against regex pattern", @@ -643,36 +735,23 @@ std::string MakeLikeRegex(const MatchSubstringOptions& options) { return like_pattern; } -// A LIKE pattern matching this regex can be translated into a substring search. -static RE2 kLikePatternIsSubstringMatch("%+([^%_]*)%+"); - // Evaluate a SQL-like LIKE pattern by translating it to a regexp or // substring search as appropriate. See what Apache Impala does: // https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc -// Note that Impala optimizes more cases (e.g. prefix match) but we -// don't have kernels for those. 
+// Note we don't optimize regex matches to substring matches like Impala does (see the +// MatchLikeEscaping test) template struct MatchLike { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { auto original_options = MatchSubstringState::Get(ctx); auto original_state = ctx->state(); - Status status; - std::string pattern; - if (!original_options.ignore_case && - re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch, - &pattern)) { - MatchSubstringOptions converted_options{pattern, original_options.ignore_case}; - MatchSubstringState converted_state(converted_options); - ctx->SetState(&converted_state); - status = MatchSubstring::Exec(ctx, batch, out); - } else { - MatchSubstringOptions converted_options{MakeLikeRegex(original_options), - original_options.ignore_case}; - MatchSubstringState converted_state(converted_options); - ctx->SetState(&converted_state); - status = MatchSubstring::Exec(ctx, batch, out); - } + MatchSubstringOptions converted_options{MakeLikeRegex(original_options), + original_options.ignore_case}; + MatchSubstringState converted_state(converted_options); + ctx->SetState(&converted_state); + auto status = + MatchSubstring::Exec(ctx, batch, out); ctx->SetState(original_state); return status; } @@ -700,6 +779,26 @@ void AddMatchSubstring(FunctionRegistry* registry) { func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init)); DCHECK_OK(registry->AddFunction(std::move(func))); } + { + auto func = std::make_shared("starts_with", Arity::Unary(), + &match_substring_doc); + auto exec_32 = MatchSubstring::Exec; + auto exec_64 = MatchSubstring::Exec; + DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init)); + DCHECK_OK( + func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init)); + DCHECK_OK(registry->AddFunction(std::move(func))); + } + { + auto func = std::make_shared("ends_with", Arity::Unary(), + &match_substring_doc); + auto 
exec_32 = MatchSubstring::Exec; + auto exec_64 = MatchSubstring::Exec; + DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init)); + DCHECK_OK( + func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init)); + DCHECK_OK(registry->AddFunction(std::move(func))); + } #ifdef ARROW_WITH_RE2 { auto func = std::make_shared("match_substring_regex", Arity::Unary(), diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 9b4cef494d7..6ec5db14d4f 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -445,6 +445,60 @@ TYPED_TEST(TestStringKernels, MatchSubstringIgnoreCase) { } #endif +TYPED_TEST(TestStringKernels, MatchStartsWith) { + MatchSubstringOptions options{"abab"}; + this->CheckUnary("starts_with", "[]", boolean(), "[]", &options); + this->CheckUnary("starts_with", R"([null, "", "ab", "abab", "$abab", "abab$"])", + boolean(), "[null, false, false, true, false, true]", &options); + this->CheckUnary("starts_with", R"(["ABAB", "BABAB", "ABABC", "bAbAb", "aBaBc"])", + boolean(), "[false, false, false, false, false]", &options); +} + +TYPED_TEST(TestStringKernels, MatchEndsWith) { + MatchSubstringOptions options{"abab"}; + this->CheckUnary("ends_with", "[]", boolean(), "[]", &options); + this->CheckUnary("ends_with", R"([null, "", "ab", "abab", "$abab", "abab$"])", + boolean(), "[null, false, false, true, true, false]", &options); + this->CheckUnary("ends_with", R"(["ABAB", "BABAB", "ABABC", "bAbAb", "aBaBc"])", + boolean(), "[false, false, false, false, false]", &options); +} + +#ifdef ARROW_WITH_RE2 +TYPED_TEST(TestStringKernels, MatchStartsWithIgnoreCase) { + MatchSubstringOptions options{"abab", /*ignore_case=*/true}; + this->CheckUnary("starts_with", "[]", boolean(), "[]", &options); + this->CheckUnary("starts_with", R"([null, "", "ab", "abab", "$abab", "abab$"])", + boolean(), "[null, 
false, false, true, false, true]", &options); + this->CheckUnary("starts_with", R"(["ABAB", "$ABAB", "ABAB$", "$AbAb", "aBaB$"])", + boolean(), "[true, false, true, false, true]", &options); +} + +TYPED_TEST(TestStringKernels, MatchEndsWithIgnoreCase) { + MatchSubstringOptions options{"abab", /*ignore_case=*/true}; + this->CheckUnary("ends_with", "[]", boolean(), "[]", &options); + this->CheckUnary("ends_with", R"([null, "", "ab", "abab", "$abab", "abab$"])", + boolean(), "[null, false, false, true, true, false]", &options); + this->CheckUnary("ends_with", R"(["ABAB", "$ABAB", "ABAB$", "$AbAb", "aBaB$"])", + boolean(), "[true, true, false, true, false]", &options); +} +#else +TYPED_TEST(TestStringKernels, MatchStartsWithIgnoreCase) { + Datum input = ArrayFromJSON(this->type(), R"(["a"])"); + MatchSubstringOptions options{"a", /*ignore_case=*/true}; + EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented, + ::testing::HasSubstr("ignore_case requires RE2"), + CallFunction("starts_with", {input}, &options)); +} + +TYPED_TEST(TestStringKernels, MatchEndsWithIgnoreCase) { + Datum input = ArrayFromJSON(this->type(), R"(["a"])"); + MatchSubstringOptions options{"a", /*ignore_case=*/true}; + EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented, + ::testing::HasSubstr("ignore_case requires RE2"), + CallFunction("ends_with", {input}, &options)); +} +#endif + #ifdef ARROW_WITH_RE2 TYPED_TEST(TestStringKernels, MatchSubstringRegex) { MatchSubstringOptions options{"ab"}; @@ -528,10 +582,15 @@ TYPED_TEST(TestStringKernels, MatchLike) { TYPED_TEST(TestStringKernels, MatchLikeEscaping) { auto inputs = R"(["%%foo", "_bar", "({", "\\baz"])"; + // N.B. 
I believe Impala mistakenly optimizes these into substring searches MatchSubstringOptions escape_percent{"\\%%"}; this->CheckUnary("match_like", inputs, boolean(), "[true, false, false, false]", &escape_percent); + MatchSubstringOptions not_substring{"%\\%%"}; + this->CheckUnary("match_like", inputs, boolean(), "[true, false, false, false]", + &not_substring); + MatchSubstringOptions escape_underscore{"\\____"}; this->CheckUnary("match_like", inputs, boolean(), "[false, true, false, false]", &escape_underscore); From f5e28ab46e99cee43ebe4bb36b1635be54c6caa4 Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 3 Jun 2021 17:01:33 -0500 Subject: [PATCH 2/6] ARROW-12949: [C++] Restore match_like optimizations --- .../arrow/compute/kernels/scalar_string.cc | 45 +++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index e62572487af..b90a94a375c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -735,23 +735,52 @@ std::string MakeLikeRegex(const MatchSubstringOptions& options) { return like_pattern; } +// A LIKE pattern matching this regex can be translated into a substring search. +static RE2 kLikePatternIsSubstringMatch(R"(%+([^%_]*[^\\%_])?%+)"); +// A LIKE pattern matching this regex can be translated into a prefix search. +static RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)"); +// A LIKE pattern matching this regex can be translated into a suffix search. +static RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))"); + // Evaluate a SQL-like LIKE pattern by translating it to a regexp or // substring search as appropriate. 
See what Apache Impala does: // https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc -// Note we don't optimize regex matches to substring matches like Impala does (see the -// MatchLikeEscaping test) template struct MatchLike { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { auto original_options = MatchSubstringState::Get(ctx); auto original_state = ctx->state(); - MatchSubstringOptions converted_options{MakeLikeRegex(original_options), - original_options.ignore_case}; - MatchSubstringState converted_state(converted_options); - ctx->SetState(&converted_state); - auto status = - MatchSubstring::Exec(ctx, batch, out); + Status status; + std::string pattern; + if (!original_options.ignore_case && + re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch, + &pattern)) { + MatchSubstringOptions converted_options{pattern, original_options.ignore_case}; + MatchSubstringState converted_state(converted_options); + ctx->SetState(&converted_state); + status = MatchSubstring::Exec(ctx, batch, out); + } else if (!original_options.ignore_case && + re2::RE2::FullMatch(original_options.pattern, kLikePatternIsStartsWith, + &pattern)) { + MatchSubstringOptions converted_options{pattern, original_options.ignore_case}; + MatchSubstringState converted_state(converted_options); + ctx->SetState(&converted_state); + status = MatchSubstring::Exec(ctx, batch, out); + } else if (!original_options.ignore_case && + re2::RE2::FullMatch(original_options.pattern, kLikePatternIsEndsWith, + &pattern)) { + MatchSubstringOptions converted_options{pattern, original_options.ignore_case}; + MatchSubstringState converted_state(converted_options); + ctx->SetState(&converted_state); + status = MatchSubstring::Exec(ctx, batch, out); + } else { + MatchSubstringOptions converted_options{MakeLikeRegex(original_options), + original_options.ignore_case}; + MatchSubstringState 
converted_state(converted_options); + ctx->SetState(&converted_state); + status = MatchSubstring::Exec(ctx, batch, out); + } ctx->SetState(original_state); return status; } From cd1185e0b5dc33c4d58349493c23823c59f705a6 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 7 Jun 2021 12:36:47 -0400 Subject: [PATCH 3/6] ARROW-12949: [C++] Don't use C++20 string_view methods --- cpp/src/arrow/compute/kernels/scalar_string.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index b90a94a375c..ca614246e97 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -506,7 +506,8 @@ struct PlainStartsWithMatcher { } bool Match(util::string_view current) const { - return current.starts_with(options_.pattern); + // string_view::starts_with is C++20 + return current.substr(0, options_.pattern.size()) == options_.pattern; } }; @@ -524,7 +525,10 @@ struct PlainEndsWithMatcher { } bool Match(util::string_view current) const { - return current.ends_with(options_.pattern); + // string_view::ends_with is C++20 + return current.size() >= options_.pattern.size() && + current.substr(current.size() - options_.pattern.size(), + options_.pattern.size()) == options_.pattern; } }; @@ -667,14 +671,14 @@ const FunctionDoc match_substring_doc( {"strings"}, "MatchSubstringOptions"); const FunctionDoc starts_with_doc( - "Check if strings start with a pattern", + "Check if strings start with a literal pattern", ("For each string in `strings`, emit true iff it starts with a given pattern.\n" "Null inputs emit null. The pattern must be given in MatchSubstringOptions. 
" "If ignore_case is set, only simple case folding is performed."), {"strings"}, "MatchSubstringOptions"); const FunctionDoc ends_with_doc( - "Check if strings end with a pattern", + "Check if strings end with a literal pattern", ("For each string in `strings`, emit true iff it ends with a given pattern.\n" "Null inputs emit null. The pattern must be given in MatchSubstringOptions. " "If ignore_case is set, only simple case folding is performed."), From b92cd98f6cd33f2547c1de56002d5cc0c0125d57 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 7 Jun 2021 14:05:49 -0400 Subject: [PATCH 4/6] ARROW-12949: [C++] Add MatchLike benchmarks --- .../kernels/scalar_string_benchmark.cc | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc index 8528c0d9e5d..606e774451c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc @@ -87,6 +87,30 @@ static void TrimManyAscii(benchmark::State& state) { UnaryStringBenchmark(state, "ascii_trim", &options); } +#ifdef ARROW_WITH_RE2 +static void MatchLike(benchmark::State& state) { + MatchSubstringOptions options("ab%ac"); + UnaryStringBenchmark(state, "match_like", &options); +} + +// MatchLike optimizes the following three into a substring/prefix/suffix search instead +// of using RE2 +static void MatchLikeSubstring(benchmark::State& state) { + MatchSubstringOptions options("%abac%"); + UnaryStringBenchmark(state, "match_like", &options); +} + +static void MatchLikePrefix(benchmark::State& state) { + MatchSubstringOptions options("%abac"); + UnaryStringBenchmark(state, "match_like", &options); +} + +static void MatchLikeSuffix(benchmark::State& state) { + MatchSubstringOptions options("%abac"); + UnaryStringBenchmark(state, "match_like", &options); +} +#endif + #ifdef ARROW_WITH_UTF8PROC static void Utf8Upper(benchmark::State& 
state) { UnaryStringBenchmark(state, "utf8_upper"); @@ -152,6 +176,12 @@ BENCHMARK(MatchSubstring); BENCHMARK(SplitPattern); BENCHMARK(TrimSingleAscii); BENCHMARK(TrimManyAscii); +#ifdef ARROW_WITH_RE2 +BENCHMARK(MatchLike); +BENCHMARK(MatchLikeSubstring); +BENCHMARK(MatchLikePrefix); +BENCHMARK(MatchLikeSuffix); +#endif #ifdef ARROW_WITH_UTF8PROC BENCHMARK(Utf8Lower); BENCHMARK(Utf8Upper); From deefabdbc136cb2df5e6510a93705d8720cf9491 Mon Sep 17 00:00:00 2001 From: David Li Date: Mon, 7 Jun 2021 16:59:46 -0400 Subject: [PATCH 5/6] ARROW-12949: [C++] Add docs for starts_with/ends_with --- docs/source/cpp/compute.rst | 47 +++++++++++++++++------------- docs/source/python/api/compute.rst | 10 ++++--- 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 434d4a23e9c..4aa38e1a295 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -563,52 +563,59 @@ Containment tests +===========================+============+====================================+====================+========================================+ | count_substring | Unary | String-like | Int32 or Int64 (1) | :struct:`MatchSubstringOptions` | +---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ -| find_substring | Unary | String-like | Int32 or Int64 (2) | :struct:`MatchSubstringOptions` | +| ends_with | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` | +---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ -| match_like | Unary | String-like | Boolean (3) | :struct:`MatchSubstringOptions` | +| find_substring | Unary | String-like | Int32 or Int64 (3) | :struct:`MatchSubstringOptions` | 
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ -| match_substring | Unary | String-like | Boolean (4) | :struct:`MatchSubstringOptions` | -+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ -| match_substring_regex | Unary | String-like | Boolean (5) | :struct:`MatchSubstringOptions` | -+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ -| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (6) | :struct:`SetLookupOptions` | +| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (4) | :struct:`SetLookupOptions` | | | | Binary- and String-like | | | +---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ -| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (7) | :struct:`SetLookupOptions` | +| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (5) | :struct:`SetLookupOptions` | | | | Binary- and String-like | | | +---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ +| match_like | Unary | String-like | Boolean (6) | :struct:`MatchSubstringOptions` | ++---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ +| match_substring | Unary | String-like | Boolean (7) | :struct:`MatchSubstringOptions` | ++---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ +| match_substring_regex | Unary | String-like | Boolean (8) | :struct:`MatchSubstringOptions` | 
++---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ +| starts_with | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` | ++---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ + * \(1) Output is the number of occurrences of :member:`MatchSubstringOptions::pattern` in the corresponding input string. Output type is Int32 for Binary/String, Int64 for LargeBinary/LargeString. -* \(2) Output is the index of the first occurrence of +* \(2) Output is true iff :member:`MatchSubstringOptions::pattern` + is a suffix/prefix of the corresponding input. + +* \(3) Output is the index of the first occurrence of :member:`MatchSubstringOptions::pattern` in the corresponding input string, otherwise -1. Output type is Int32 for Binary/String, Int64 for LargeBinary/LargeString. -* \(3) Output is true iff the SQL-style LIKE pattern +* \(4) Output is the index of the corresponding input element in + :member:`SetLookupOptions::value_set`, if found there. Otherwise, + output is null. + +* \(5) Output is true iff the corresponding input element is equal to one + of the elements in :member:`SetLookupOptions::value_set`. + +* \(6) Output is true iff the SQL-style LIKE pattern :member:`MatchSubstringOptions::pattern` fully matches the corresponding input element. That is, ``%`` will match any number of characters, ``_`` will match exactly one character, and any other character matches itself. To match a literal percent sign or underscore, precede the character with a backslash. -* \(4) Output is true iff :member:`MatchSubstringOptions::pattern` +* \(7) Output is true iff :member:`MatchSubstringOptions::pattern` is a substring of the corresponding input element. 
-* \(5) Output is true iff :member:`MatchSubstringOptions::pattern` +* \(8) Output is true iff :member:`MatchSubstringOptions::pattern` matches the corresponding input element at any position. -* \(6) Output is the index of the corresponding input element in - :member:`SetLookupOptions::value_set`, if found there. Otherwise, - output is null. - -* \(7) Output is true iff the corresponding input element is equal to one - of the elements in :member:`SetLookupOptions::value_set`. - - String splitting ~~~~~~~~~~~~~~~~ diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index a586f9011fd..1dbcb3073ca 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -40,7 +40,7 @@ Arithmetic Functions -------------------- By default these functions do not detect overflow. Each function is also -available in an overflow-checking variant, suffixed ``_checked``, which +available in an overflow-checking variant, suffixed ``_checked``, which throws an ``ArrowInvalid`` exception when overflow is detected. .. autosummary:: @@ -104,11 +104,11 @@ logic variants are provided (suffixed ``_kleene``). See User Guide for details. String Predicates ----------------- -In these functions an empty string emits false in the output. For ASCII +In these functions an empty string emits false in the output. For ASCII variants (prefixed ``ascii_``) a string element with non-ASCII characters emits false in the output. -The first set of functions emit true if the input contains only +The first set of functions emit true if the input contains only characters of a given class. .. autosummary:: @@ -140,7 +140,7 @@ in the string element. ascii_is_title utf8_is_title -The third set of functions examines string elements on +The third set of functions examines string elements on a byte-by-byte basis. .. 
autosummary:: @@ -179,12 +179,14 @@ Containment tests :toctree: ../generated/ count_substring + ends_with find_substring index_in is_in match_like match_substring match_substring_regex + starts_with Conversions ----------- From 401f03e8e045484714953920e0de881de1c7da13 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 8 Jun 2021 10:01:36 +0200 Subject: [PATCH 6/6] Avoid compiling regexes at startup --- cpp/src/arrow/compute/kernels/scalar_string.cc | 15 ++++++++------- .../arrow/compute/kernels/scalar_string_test.cc | 4 ++-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index ca614246e97..a1e19b608d9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -739,19 +739,20 @@ std::string MakeLikeRegex(const MatchSubstringOptions& options) { return like_pattern; } -// A LIKE pattern matching this regex can be translated into a substring search. -static RE2 kLikePatternIsSubstringMatch(R"(%+([^%_]*[^\\%_])?%+)"); -// A LIKE pattern matching this regex can be translated into a prefix search. -static RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)"); -// A LIKE pattern matching this regex can be translated into a suffix search. -static RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))"); - // Evaluate a SQL-like LIKE pattern by translating it to a regexp or // substring search as appropriate. See what Apache Impala does: // https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc template struct MatchLike { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + // NOTE: avoid making those constants global to avoid compiling regexes at startup + // A LIKE pattern matching this regex can be translated into a substring search. 
+ static const RE2 kLikePatternIsSubstringMatch(R"(%+([^%_]*[^\\%_])?%+)"); + // A LIKE pattern matching this regex can be translated into a prefix search. + static const RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)"); + // A LIKE pattern matching this regex can be translated into a suffix search. + static const RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))"); + auto original_options = MatchSubstringState::Get(ctx); auto original_state = ctx->state(); diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 6ec5db14d4f..f015e339423 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -465,7 +465,7 @@ TYPED_TEST(TestStringKernels, MatchEndsWith) { #ifdef ARROW_WITH_RE2 TYPED_TEST(TestStringKernels, MatchStartsWithIgnoreCase) { - MatchSubstringOptions options{"abab", /*ignore_case=*/true}; + MatchSubstringOptions options{"aBAb", /*ignore_case=*/true}; this->CheckUnary("starts_with", "[]", boolean(), "[]", &options); this->CheckUnary("starts_with", R"([null, "", "ab", "abab", "$abab", "abab$"])", boolean(), "[null, false, false, true, false, true]", &options); @@ -474,7 +474,7 @@ TYPED_TEST(TestStringKernels, MatchStartsWithIgnoreCase) { } TYPED_TEST(TestStringKernels, MatchEndsWithIgnoreCase) { - MatchSubstringOptions options{"abab", /*ignore_case=*/true}; + MatchSubstringOptions options{"aBAb", /*ignore_case=*/true}; this->CheckUnary("ends_with", "[]", boolean(), "[]", &options); this->CheckUnary("ends_with", R"([null, "", "ab", "abab", "$abab", "abab$"])", boolean(), "[null, false, false, true, true, false]", &options);