diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index df3a3991fcf..a1e19b608d9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -492,6 +492,46 @@ struct PlainSubstringMatcher { bool Match(util::string_view current) const { return Find(current) >= 0; } }; +struct PlainStartsWithMatcher { + const MatchSubstringOptions& options_; + + explicit PlainStartsWithMatcher(const MatchSubstringOptions& options) + : options_(options) {} + + static Result<std::unique_ptr<PlainStartsWithMatcher>> Make( + const MatchSubstringOptions& options) { + // Should be handled by partial template specialization below + DCHECK(!options.ignore_case); + return ::arrow::internal::make_unique<PlainStartsWithMatcher>(options); + } + + bool Match(util::string_view current) const { + // string_view::starts_with is C++20 + return current.substr(0, options_.pattern.size()) == options_.pattern; + } +}; + +struct PlainEndsWithMatcher { + const MatchSubstringOptions& options_; + + explicit PlainEndsWithMatcher(const MatchSubstringOptions& options) + : options_(options) {} + + static Result<std::unique_ptr<PlainEndsWithMatcher>> Make( + const MatchSubstringOptions& options) { + // Should be handled by partial template specialization below + DCHECK(!options.ignore_case); + return ::arrow::internal::make_unique<PlainEndsWithMatcher>(options); + } + + bool Match(util::string_view current) const { + // string_view::ends_with is C++20 + return current.size() >= options_.pattern.size() && + current.substr(current.size() - options_.pattern.size(), + options_.pattern.size()) == options_.pattern; + } +}; + #ifdef ARROW_WITH_RE2 struct RegexSubstringMatcher { const MatchSubstringOptions& options_; @@ -581,6 +621,48 @@ struct MatchSubstring { } }; +template <typename Type> +struct MatchSubstring<Type, PlainStartsWithMatcher> { + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + auto options = MatchSubstringState::Get(ctx); + if (options.ignore_case) { +#ifdef ARROW_WITH_RE2 + MatchSubstringOptions converted_options = options; + 
converted_options.pattern = "^" + RE2::QuoteMeta(options.pattern); + ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options)); + return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out, + matcher.get()); +#else + return Status::NotImplemented("ignore_case requires RE2"); +#endif + } + ARROW_ASSIGN_OR_RAISE(auto matcher, PlainStartsWithMatcher::Make(options)); + return MatchSubstringImpl<Type, PlainStartsWithMatcher>::Exec(ctx, batch, out, + matcher.get()); + } +}; + +template <typename Type> +struct MatchSubstring<Type, PlainEndsWithMatcher> { + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + auto options = MatchSubstringState::Get(ctx); + if (options.ignore_case) { +#ifdef ARROW_WITH_RE2 + MatchSubstringOptions converted_options = options; + converted_options.pattern = RE2::QuoteMeta(options.pattern) + "$"; + ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options)); + return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out, + matcher.get()); +#else + return Status::NotImplemented("ignore_case requires RE2"); +#endif + } + ARROW_ASSIGN_OR_RAISE(auto matcher, PlainEndsWithMatcher::Make(options)); + return MatchSubstringImpl<Type, PlainEndsWithMatcher>::Exec(ctx, batch, out, + matcher.get()); + } +}; + const FunctionDoc match_substring_doc( "Match strings against literal pattern", ("For each string in `strings`, emit true iff it contains a given pattern.\n" @@ -588,6 +670,20 @@ const FunctionDoc match_substring_doc( "If ignore_case is set, only simple case folding is performed."), {"strings"}, "MatchSubstringOptions"); +const FunctionDoc starts_with_doc( + "Check if strings start with a literal pattern", + ("For each string in `strings`, emit true iff it starts with a given pattern.\n" + "Null inputs emit null. The pattern must be given in MatchSubstringOptions. 
" + "If ignore_case is set, only simple case folding is performed."), + {"strings"}, "MatchSubstringOptions"); + +const FunctionDoc ends_with_doc( + "Check if strings end with a literal pattern", + ("For each string in `strings`, emit true iff it ends with a given pattern.\n" + "Null inputs emit null. The pattern must be given in MatchSubstringOptions. " + "If ignore_case is set, only simple case folding is performed."), + {"strings"}, "MatchSubstringOptions"); + #ifdef ARROW_WITH_RE2 const FunctionDoc match_substring_regex_doc( "Match strings against regex pattern", @@ -643,17 +739,20 @@ std::string MakeLikeRegex(const MatchSubstringOptions& options) { return like_pattern; } -// A LIKE pattern matching this regex can be translated into a substring search. -static RE2 kLikePatternIsSubstringMatch("%+([^%_]*)%+"); - // Evaluate a SQL-like LIKE pattern by translating it to a regexp or // substring search as appropriate. See what Apache Impala does: // https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc -// Note that Impala optimizes more cases (e.g. prefix match) but we -// don't have kernels for those. template <typename StringType> struct MatchLike { static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + // NOTE: avoid making those constants global to avoid compiling regexes at startup + // A LIKE pattern matching this regex can be translated into a substring search. + static const RE2 kLikePatternIsSubstringMatch(R"(%+([^%_]*[^\\%_])?%+)"); + // A LIKE pattern matching this regex can be translated into a prefix search. + static const RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)"); + // A LIKE pattern matching this regex can be translated into a suffix search. 
+ static const RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))"); + auto original_options = MatchSubstringState::Get(ctx); auto original_state = ctx->state(); @@ -666,6 +765,20 @@ struct MatchLike { MatchSubstringState converted_state(converted_options); ctx->SetState(&converted_state); status = MatchSubstring<StringType, PlainSubstringMatcher>::Exec(ctx, batch, out); + } else if (!original_options.ignore_case && + re2::RE2::FullMatch(original_options.pattern, kLikePatternIsStartsWith, + &pattern)) { + MatchSubstringOptions converted_options{pattern, original_options.ignore_case}; + MatchSubstringState converted_state(converted_options); + ctx->SetState(&converted_state); + status = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec(ctx, batch, out); + } else if (!original_options.ignore_case && + re2::RE2::FullMatch(original_options.pattern, kLikePatternIsEndsWith, + &pattern)) { + MatchSubstringOptions converted_options{pattern, original_options.ignore_case}; + MatchSubstringState converted_state(converted_options); + ctx->SetState(&converted_state); + status = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec(ctx, batch, out); } else { MatchSubstringOptions converted_options{MakeLikeRegex(original_options), original_options.ignore_case}; @@ -700,6 +813,26 @@ void AddMatchSubstring(FunctionRegistry* registry) { func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init)); DCHECK_OK(registry->AddFunction(std::move(func))); } + { + auto func = std::make_shared<ScalarFunction>("starts_with", Arity::Unary(), + &starts_with_doc); + auto exec_32 = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec; + auto exec_64 = MatchSubstring<LargeStringType, PlainStartsWithMatcher>::Exec; + DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init)); + DCHECK_OK( + func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init)); + DCHECK_OK(registry->AddFunction(std::move(func))); + } + { + auto func = std::make_shared<ScalarFunction>("ends_with", Arity::Unary(), + &ends_with_doc); + auto exec_32 = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec; + auto exec_64 = MatchSubstring<LargeStringType, PlainEndsWithMatcher>::Exec; + DCHECK_OK(func->AddKernel({utf8()}, boolean(), 
exec_32, MatchSubstringState::Init)); + DCHECK_OK( + func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init)); + DCHECK_OK(registry->AddFunction(std::move(func))); + } #ifdef ARROW_WITH_RE2 { auto func = std::make_shared<ScalarFunction>("match_substring_regex", Arity::Unary(), diff --git a/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc index 8528c0d9e5d..606e774451c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc @@ -87,6 +87,30 @@ static void TrimManyAscii(benchmark::State& state) { UnaryStringBenchmark(state, "ascii_trim", &options); } +#ifdef ARROW_WITH_RE2 +static void MatchLike(benchmark::State& state) { + MatchSubstringOptions options("ab%ac"); + UnaryStringBenchmark(state, "match_like", &options); +} + +// MatchLike optimizes the following three into a substring/prefix/suffix search instead +// of using RE2 +static void MatchLikeSubstring(benchmark::State& state) { + MatchSubstringOptions options("%abac%"); + UnaryStringBenchmark(state, "match_like", &options); +} + +static void MatchLikePrefix(benchmark::State& state) { + MatchSubstringOptions options("abac%"); + UnaryStringBenchmark(state, "match_like", &options); +} + +static void MatchLikeSuffix(benchmark::State& state) { + MatchSubstringOptions options("%abac"); + UnaryStringBenchmark(state, "match_like", &options); +} +#endif + #ifdef ARROW_WITH_UTF8PROC static void Utf8Upper(benchmark::State& state) { UnaryStringBenchmark(state, "utf8_upper"); @@ -152,6 +176,12 @@ BENCHMARK(MatchSubstring); BENCHMARK(SplitPattern); BENCHMARK(TrimSingleAscii); BENCHMARK(TrimManyAscii); +#ifdef ARROW_WITH_RE2 +BENCHMARK(MatchLike); +BENCHMARK(MatchLikeSubstring); +BENCHMARK(MatchLikePrefix); +BENCHMARK(MatchLikeSuffix); +#endif #ifdef ARROW_WITH_UTF8PROC BENCHMARK(Utf8Lower); BENCHMARK(Utf8Upper); diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc 
b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 9b4cef494d7..f015e339423 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -445,6 +445,60 @@ TYPED_TEST(TestStringKernels, MatchSubstringIgnoreCase) { } #endif +TYPED_TEST(TestStringKernels, MatchStartsWith) { + MatchSubstringOptions options{"abab"}; + this->CheckUnary("starts_with", "[]", boolean(), "[]", &options); + this->CheckUnary("starts_with", R"([null, "", "ab", "abab", "$abab", "abab$"])", + boolean(), "[null, false, false, true, false, true]", &options); + this->CheckUnary("starts_with", R"(["ABAB", "BABAB", "ABABC", "bAbAb", "aBaBc"])", + boolean(), "[false, false, false, false, false]", &options); +} + +TYPED_TEST(TestStringKernels, MatchEndsWith) { + MatchSubstringOptions options{"abab"}; + this->CheckUnary("ends_with", "[]", boolean(), "[]", &options); + this->CheckUnary("ends_with", R"([null, "", "ab", "abab", "$abab", "abab$"])", + boolean(), "[null, false, false, true, true, false]", &options); + this->CheckUnary("ends_with", R"(["ABAB", "BABAB", "ABABC", "bAbAb", "aBaBc"])", + boolean(), "[false, false, false, false, false]", &options); +} + +#ifdef ARROW_WITH_RE2 +TYPED_TEST(TestStringKernels, MatchStartsWithIgnoreCase) { + MatchSubstringOptions options{"aBAb", /*ignore_case=*/true}; + this->CheckUnary("starts_with", "[]", boolean(), "[]", &options); + this->CheckUnary("starts_with", R"([null, "", "ab", "abab", "$abab", "abab$"])", + boolean(), "[null, false, false, true, false, true]", &options); + this->CheckUnary("starts_with", R"(["ABAB", "$ABAB", "ABAB$", "$AbAb", "aBaB$"])", + boolean(), "[true, false, true, false, true]", &options); +} + +TYPED_TEST(TestStringKernels, MatchEndsWithIgnoreCase) { + MatchSubstringOptions options{"aBAb", /*ignore_case=*/true}; + this->CheckUnary("ends_with", "[]", boolean(), "[]", &options); + this->CheckUnary("ends_with", R"([null, "", "ab", "abab", "$abab", "abab$"])", + 
boolean(), "[null, false, false, true, true, false]", &options); + this->CheckUnary("ends_with", R"(["ABAB", "$ABAB", "ABAB$", "$AbAb", "aBaB$"])", + boolean(), "[true, true, false, true, false]", &options); +} +#else +TYPED_TEST(TestStringKernels, MatchStartsWithIgnoreCase) { + Datum input = ArrayFromJSON(this->type(), R"(["a"])"); + MatchSubstringOptions options{"a", /*ignore_case=*/true}; + EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented, + ::testing::HasSubstr("ignore_case requires RE2"), + CallFunction("starts_with", {input}, &options)); +} + +TYPED_TEST(TestStringKernels, MatchEndsWithIgnoreCase) { + Datum input = ArrayFromJSON(this->type(), R"(["a"])"); + MatchSubstringOptions options{"a", /*ignore_case=*/true}; + EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented, + ::testing::HasSubstr("ignore_case requires RE2"), + CallFunction("ends_with", {input}, &options)); +} +#endif + #ifdef ARROW_WITH_RE2 TYPED_TEST(TestStringKernels, MatchSubstringRegex) { MatchSubstringOptions options{"ab"}; @@ -528,10 +582,15 @@ TYPED_TEST(TestStringKernels, MatchLike) { TYPED_TEST(TestStringKernels, MatchLikeEscaping) { auto inputs = R"(["%%foo", "_bar", "({", "\\baz"])"; + // N.B. 
I believe Impala mistakenly optimizes these into substring searches MatchSubstringOptions escape_percent{"\\%%"}; this->CheckUnary("match_like", inputs, boolean(), "[true, false, false, false]", &escape_percent); + MatchSubstringOptions not_substring{"%\\%%"}; + this->CheckUnary("match_like", inputs, boolean(), "[true, false, false, false]", + &not_substring); + MatchSubstringOptions escape_underscore{"\\____"}; this->CheckUnary("match_like", inputs, boolean(), "[false, true, false, false]", &escape_underscore); diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 434d4a23e9c..4aa38e1a295 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -563,52 +563,59 @@ Containment tests +===========================+============+====================================+====================+========================================+ | count_substring | Unary | String-like | Int32 or Int64 (1) | :struct:`MatchSubstringOptions` | +---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ -| find_substring | Unary | String-like | Int32 or Int64 (2) | :struct:`MatchSubstringOptions` | +| ends_with | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` | +---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ -| match_like | Unary | String-like | Boolean (3) | :struct:`MatchSubstringOptions` | +| find_substring | Unary | String-like | Int32 or Int64 (3) | :struct:`MatchSubstringOptions` | +---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ -| match_substring | Unary | String-like | Boolean (4) | :struct:`MatchSubstringOptions` | -+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ -| 
match_substring_regex | Unary | String-like | Boolean (5) | :struct:`MatchSubstringOptions` | -+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ -| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (6) | :struct:`SetLookupOptions` | +| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (4) | :struct:`SetLookupOptions` | | | | Binary- and String-like | | | +---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ -| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (7) | :struct:`SetLookupOptions` | +| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (5) | :struct:`SetLookupOptions` | | | | Binary- and String-like | | | +---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ +| match_like | Unary | String-like | Boolean (6) | :struct:`MatchSubstringOptions` | ++---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ +| match_substring | Unary | String-like | Boolean (7) | :struct:`MatchSubstringOptions` | ++---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ +| match_substring_regex | Unary | String-like | Boolean (8) | :struct:`MatchSubstringOptions` | ++---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ +| starts_with | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` | ++---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ + * \(1) Output is the number of occurrences of 
:member:`MatchSubstringOptions::pattern` in the corresponding input string. Output type is Int32 for Binary/String, Int64 for LargeBinary/LargeString. -* \(2) Output is the index of the first occurrence of +* \(2) Output is true iff :member:`MatchSubstringOptions::pattern` + is a suffix/prefix of the corresponding input. + +* \(3) Output is the index of the first occurrence of :member:`MatchSubstringOptions::pattern` in the corresponding input string, otherwise -1. Output type is Int32 for Binary/String, Int64 for LargeBinary/LargeString. -* \(3) Output is true iff the SQL-style LIKE pattern +* \(4) Output is the index of the corresponding input element in + :member:`SetLookupOptions::value_set`, if found there. Otherwise, + output is null. + +* \(5) Output is true iff the corresponding input element is equal to one + of the elements in :member:`SetLookupOptions::value_set`. + +* \(6) Output is true iff the SQL-style LIKE pattern :member:`MatchSubstringOptions::pattern` fully matches the corresponding input element. That is, ``%`` will match any number of characters, ``_`` will match exactly one character, and any other character matches itself. To match a literal percent sign or underscore, precede the character with a backslash. -* \(4) Output is true iff :member:`MatchSubstringOptions::pattern` +* \(7) Output is true iff :member:`MatchSubstringOptions::pattern` is a substring of the corresponding input element. -* \(5) Output is true iff :member:`MatchSubstringOptions::pattern` +* \(8) Output is true iff :member:`MatchSubstringOptions::pattern` matches the corresponding input element at any position. -* \(6) Output is the index of the corresponding input element in - :member:`SetLookupOptions::value_set`, if found there. Otherwise, - output is null. - -* \(7) Output is true iff the corresponding input element is equal to one - of the elements in :member:`SetLookupOptions::value_set`. 
- - String splitting ~~~~~~~~~~~~~~~~ diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index a586f9011fd..1dbcb3073ca 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -40,7 +40,7 @@ Arithmetic Functions -------------------- By default these functions do not detect overflow. Each function is also -available in an overflow-checking variant, suffixed ``_checked``, which +available in an overflow-checking variant, suffixed ``_checked``, which throws an ``ArrowInvalid`` exception when overflow is detected. .. autosummary:: @@ -104,11 +104,11 @@ logic variants are provided (suffixed ``_kleene``). See User Guide for details. String Predicates ----------------- -In these functions an empty string emits false in the output. For ASCII +In these functions an empty string emits false in the output. For ASCII variants (prefixed ``ascii_``) a string element with non-ASCII characters emits false in the output. -The first set of functions emit true if the input contains only +The first set of functions emit true if the input contains only characters of a given class. .. autosummary:: @@ -140,7 +140,7 @@ in the string element. ascii_is_title utf8_is_title -The third set of functions examines string elements on +The third set of functions examines string elements on a byte-by-byte basis. .. autosummary:: @@ -179,12 +179,14 @@ Containment tests :toctree: ../generated/ count_substring + ends_with find_substring index_in is_in match_like match_substring match_substring_regex + starts_with Conversions -----------