From 4702bc54327209bba67889ac1f2e9994bb940808 Mon Sep 17 00:00:00 2001
From: David Li
Date: Thu, 3 Jun 2021 16:47:03 -0500
Subject: [PATCH 1/6] ARROW-12949: [C++] Add starts_with/ends_with kernels
---
.../arrow/compute/kernels/scalar_string.cc | 141 +++++++++++++++---
.../compute/kernels/scalar_string_test.cc | 59 ++++++++
2 files changed, 179 insertions(+), 21 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index df3a3991fcf..e62572487af 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -492,6 +492,42 @@ struct PlainSubstringMatcher {
bool Match(util::string_view current) const { return Find(current) >= 0; }
};
+struct PlainStartsWithMatcher {
+ const MatchSubstringOptions& options_;
+
+ explicit PlainStartsWithMatcher(const MatchSubstringOptions& options)
+ : options_(options) {}
+
+ static Result<std::unique_ptr<PlainStartsWithMatcher>> Make(
+ const MatchSubstringOptions& options) {
+ // ignore_case is handled by the MatchSubstring partial specialization below
+ DCHECK(!options.ignore_case);
+ return ::arrow::internal::make_unique<PlainStartsWithMatcher>(options);
+ }
+
+ bool Match(util::string_view current) const {
+ return current.starts_with(options_.pattern);
+ }
+};
+
+struct PlainEndsWithMatcher {
+ const MatchSubstringOptions& options_;
+
+ explicit PlainEndsWithMatcher(const MatchSubstringOptions& options)
+ : options_(options) {}
+
+ static Result<std::unique_ptr<PlainEndsWithMatcher>> Make(
+ const MatchSubstringOptions& options) {
+ // ignore_case is handled by the MatchSubstring partial specialization below
+ DCHECK(!options.ignore_case);
+ return ::arrow::internal::make_unique<PlainEndsWithMatcher>(options);
+ }
+
+ bool Match(util::string_view current) const {
+ return current.ends_with(options_.pattern);
+ }
+};
+
#ifdef ARROW_WITH_RE2
struct RegexSubstringMatcher {
const MatchSubstringOptions& options_;
@@ -581,6 +617,48 @@ struct MatchSubstring {
}
};
+template <typename Type>
+struct MatchSubstring<Type, PlainStartsWithMatcher> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ auto options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ MatchSubstringOptions converted_options = options;
+ converted_options.pattern = "^" + RE2::QuoteMeta(options.pattern);
+ ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options));
+ return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+#else
+ return Status::NotImplemented("ignore_case requires RE2");
+#endif
+ }
+ ARROW_ASSIGN_OR_RAISE(auto matcher, PlainStartsWithMatcher::Make(options));
+ return MatchSubstringImpl<Type, PlainStartsWithMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+ }
+};
+
+template <typename Type>
+struct MatchSubstring<Type, PlainEndsWithMatcher> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ auto options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ MatchSubstringOptions converted_options = options;
+ converted_options.pattern = RE2::QuoteMeta(options.pattern) + "$";
+ ARROW_ASSIGN_OR_RAISE(auto matcher, RegexSubstringMatcher::Make(converted_options));
+ return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+#else
+ return Status::NotImplemented("ignore_case requires RE2");
+#endif
+ }
+ ARROW_ASSIGN_OR_RAISE(auto matcher, PlainEndsWithMatcher::Make(options));
+ return MatchSubstringImpl<Type, PlainEndsWithMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+ }
+};
+
const FunctionDoc match_substring_doc(
"Match strings against literal pattern",
("For each string in `strings`, emit true iff it contains a given pattern.\n"
@@ -588,6 +666,20 @@ const FunctionDoc match_substring_doc(
"If ignore_case is set, only simple case folding is performed."),
{"strings"}, "MatchSubstringOptions");
+const FunctionDoc starts_with_doc(
+ "Check if strings start with a pattern",
+ ("For each string in `strings`, emit true iff it starts with a given pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
+ "If ignore_case is set, only simple case folding is performed."),
+ {"strings"}, "MatchSubstringOptions");
+
+const FunctionDoc ends_with_doc(
+ "Check if strings end with a pattern",
+ ("For each string in `strings`, emit true iff it ends with a given pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
+ "If ignore_case is set, only simple case folding is performed."),
+ {"strings"}, "MatchSubstringOptions");
+
#ifdef ARROW_WITH_RE2
const FunctionDoc match_substring_regex_doc(
"Match strings against regex pattern",
@@ -643,36 +735,23 @@ std::string MakeLikeRegex(const MatchSubstringOptions& options) {
return like_pattern;
}
-// A LIKE pattern matching this regex can be translated into a substring search.
-static RE2 kLikePatternIsSubstringMatch("%+([^%_]*)%+");
-
// Evaluate a SQL-like LIKE pattern by translating it to a regexp or
// substring search as appropriate. See what Apache Impala does:
// https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc
-// Note that Impala optimizes more cases (e.g. prefix match) but we
-// don't have kernels for those.
+// Note we don't optimize regex matches to substring matches like Impala does (see the
+// MatchLikeEscaping test)
template
struct MatchLike {
static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
auto original_options = MatchSubstringState::Get(ctx);
auto original_state = ctx->state();
- Status status;
- std::string pattern;
- if (!original_options.ignore_case &&
- re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch,
- &pattern)) {
- MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
- MatchSubstringState converted_state(converted_options);
- ctx->SetState(&converted_state);
- status = MatchSubstring::Exec(ctx, batch, out);
- } else {
- MatchSubstringOptions converted_options{MakeLikeRegex(original_options),
- original_options.ignore_case};
- MatchSubstringState converted_state(converted_options);
- ctx->SetState(&converted_state);
- status = MatchSubstring::Exec(ctx, batch, out);
- }
+ MatchSubstringOptions converted_options{MakeLikeRegex(original_options),
+ original_options.ignore_case};
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);
+ auto status =
+ MatchSubstring::Exec(ctx, batch, out);
ctx->SetState(original_state);
return status;
}
@@ -700,6 +779,26 @@ void AddMatchSubstring(FunctionRegistry* registry) {
func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
DCHECK_OK(registry->AddFunction(std::move(func)));
}
+ {
+ auto func = std::make_shared<ScalarFunction>("starts_with", Arity::Unary(),
+ &starts_with_doc);
+ auto exec_32 = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec;
+ auto exec_64 = MatchSubstring<LargeStringType, PlainStartsWithMatcher>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+ {
+ auto func = std::make_shared<ScalarFunction>("ends_with", Arity::Unary(),
+ &ends_with_doc);
+ auto exec_32 = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec;
+ auto exec_64 = MatchSubstring<LargeStringType, PlainEndsWithMatcher>::Exec;
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
#ifdef ARROW_WITH_RE2
{
auto func = std::make_shared("match_substring_regex", Arity::Unary(),
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 9b4cef494d7..6ec5db14d4f 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -445,6 +445,60 @@ TYPED_TEST(TestStringKernels, MatchSubstringIgnoreCase) {
}
#endif
+TYPED_TEST(TestStringKernels, MatchStartsWith) {
+ MatchSubstringOptions options{"abab"};
+ this->CheckUnary("starts_with", "[]", boolean(), "[]", &options);
+ this->CheckUnary("starts_with", R"([null, "", "ab", "abab", "$abab", "abab$"])",
+ boolean(), "[null, false, false, true, false, true]", &options);
+ this->CheckUnary("starts_with", R"(["ABAB", "BABAB", "ABABC", "bAbAb", "aBaBc"])",
+ boolean(), "[false, false, false, false, false]", &options);
+}
+
+TYPED_TEST(TestStringKernels, MatchEndsWith) {
+ MatchSubstringOptions options{"abab"};
+ this->CheckUnary("ends_with", "[]", boolean(), "[]", &options);
+ this->CheckUnary("ends_with", R"([null, "", "ab", "abab", "$abab", "abab$"])",
+ boolean(), "[null, false, false, true, true, false]", &options);
+ this->CheckUnary("ends_with", R"(["ABAB", "BABAB", "ABABC", "bAbAb", "aBaBc"])",
+ boolean(), "[false, false, false, false, false]", &options);
+}
+
+#ifdef ARROW_WITH_RE2
+TYPED_TEST(TestStringKernels, MatchStartsWithIgnoreCase) {
+ MatchSubstringOptions options{"abab", /*ignore_case=*/true};
+ this->CheckUnary("starts_with", "[]", boolean(), "[]", &options);
+ this->CheckUnary("starts_with", R"([null, "", "ab", "abab", "$abab", "abab$"])",
+ boolean(), "[null, false, false, true, false, true]", &options);
+ this->CheckUnary("starts_with", R"(["ABAB", "$ABAB", "ABAB$", "$AbAb", "aBaB$"])",
+ boolean(), "[true, false, true, false, true]", &options);
+}
+
+TYPED_TEST(TestStringKernels, MatchEndsWithIgnoreCase) {
+ MatchSubstringOptions options{"abab", /*ignore_case=*/true};
+ this->CheckUnary("ends_with", "[]", boolean(), "[]", &options);
+ this->CheckUnary("ends_with", R"([null, "", "ab", "abab", "$abab", "abab$"])",
+ boolean(), "[null, false, false, true, true, false]", &options);
+ this->CheckUnary("ends_with", R"(["ABAB", "$ABAB", "ABAB$", "$AbAb", "aBaB$"])",
+ boolean(), "[true, true, false, true, false]", &options);
+}
+#else
+TYPED_TEST(TestStringKernels, MatchStartsWithIgnoreCase) {
+ Datum input = ArrayFromJSON(this->type(), R"(["a"])");
+ MatchSubstringOptions options{"a", /*ignore_case=*/true};
+ EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented,
+ ::testing::HasSubstr("ignore_case requires RE2"),
+ CallFunction("starts_with", {input}, &options));
+}
+
+TYPED_TEST(TestStringKernels, MatchEndsWithIgnoreCase) {
+ Datum input = ArrayFromJSON(this->type(), R"(["a"])");
+ MatchSubstringOptions options{"a", /*ignore_case=*/true};
+ EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented,
+ ::testing::HasSubstr("ignore_case requires RE2"),
+ CallFunction("ends_with", {input}, &options));
+}
+#endif
+
#ifdef ARROW_WITH_RE2
TYPED_TEST(TestStringKernels, MatchSubstringRegex) {
MatchSubstringOptions options{"ab"};
@@ -528,10 +582,15 @@ TYPED_TEST(TestStringKernels, MatchLike) {
TYPED_TEST(TestStringKernels, MatchLikeEscaping) {
auto inputs = R"(["%%foo", "_bar", "({", "\\baz"])";
+ // N.B. I believe Impala mistakenly optimizes these into substring searches
MatchSubstringOptions escape_percent{"\\%%"};
this->CheckUnary("match_like", inputs, boolean(), "[true, false, false, false]",
&escape_percent);
+ MatchSubstringOptions not_substring{"%\\%%"};
+ this->CheckUnary("match_like", inputs, boolean(), "[true, false, false, false]",
+ &not_substring);
+
MatchSubstringOptions escape_underscore{"\\____"};
this->CheckUnary("match_like", inputs, boolean(), "[false, true, false, false]",
&escape_underscore);
From f5e28ab46e99cee43ebe4bb36b1635be54c6caa4 Mon Sep 17 00:00:00 2001
From: David Li
Date: Thu, 3 Jun 2021 17:01:33 -0500
Subject: [PATCH 2/6] ARROW-12949: [C++] Restore match_like optimizations
---
.../arrow/compute/kernels/scalar_string.cc | 45 +++++++++++++++----
1 file changed, 37 insertions(+), 8 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index e62572487af..b90a94a375c 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -735,23 +735,52 @@ std::string MakeLikeRegex(const MatchSubstringOptions& options) {
return like_pattern;
}
+// A LIKE pattern matching this regex can be translated into a substring search.
+static RE2 kLikePatternIsSubstringMatch(R"(%+([^%_]*[^\\%_])?%+)");
+// A LIKE pattern matching this regex can be translated into a prefix search.
+static RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)");
+// A LIKE pattern matching this regex can be translated into a suffix search.
+static RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))");
+
// Evaluate a SQL-like LIKE pattern by translating it to a regexp or
// substring search as appropriate. See what Apache Impala does:
// https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc
-// Note we don't optimize regex matches to substring matches like Impala does (see the
-// MatchLikeEscaping test)
template
struct MatchLike {
static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
auto original_options = MatchSubstringState::Get(ctx);
auto original_state = ctx->state();
- MatchSubstringOptions converted_options{MakeLikeRegex(original_options),
- original_options.ignore_case};
- MatchSubstringState converted_state(converted_options);
- ctx->SetState(&converted_state);
- auto status =
- MatchSubstring::Exec(ctx, batch, out);
+ Status status;
+ std::string pattern;
+ if (!original_options.ignore_case &&
+ re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch,
+ &pattern)) {
+ MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);
+ status = MatchSubstring::Exec(ctx, batch, out);
+ } else if (!original_options.ignore_case &&
+ re2::RE2::FullMatch(original_options.pattern, kLikePatternIsStartsWith,
+ &pattern)) {
+ MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);
+ status = MatchSubstring::Exec(ctx, batch, out);
+ } else if (!original_options.ignore_case &&
+ re2::RE2::FullMatch(original_options.pattern, kLikePatternIsEndsWith,
+ &pattern)) {
+ MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);
+ status = MatchSubstring::Exec(ctx, batch, out);
+ } else {
+ MatchSubstringOptions converted_options{MakeLikeRegex(original_options),
+ original_options.ignore_case};
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);
+ status = MatchSubstring::Exec(ctx, batch, out);
+ }
ctx->SetState(original_state);
return status;
}
From cd1185e0b5dc33c4d58349493c23823c59f705a6 Mon Sep 17 00:00:00 2001
From: David Li
Date: Mon, 7 Jun 2021 12:36:47 -0400
Subject: [PATCH 3/6] ARROW-12949: [C++] Don't use C++20 string_view methods
---
cpp/src/arrow/compute/kernels/scalar_string.cc | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index b90a94a375c..ca614246e97 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -506,7 +506,8 @@ struct PlainStartsWithMatcher {
}
bool Match(util::string_view current) const {
- return current.starts_with(options_.pattern);
+ // string_view::starts_with is C++20
+ return current.substr(0, options_.pattern.size()) == options_.pattern;
}
};
@@ -524,7 +525,10 @@ struct PlainEndsWithMatcher {
}
bool Match(util::string_view current) const {
- return current.ends_with(options_.pattern);
+ // string_view::ends_with is C++20
+ return current.size() >= options_.pattern.size() &&
+ current.substr(current.size() - options_.pattern.size(),
+ options_.pattern.size()) == options_.pattern;
}
};
@@ -667,14 +671,14 @@ const FunctionDoc match_substring_doc(
{"strings"}, "MatchSubstringOptions");
const FunctionDoc starts_with_doc(
- "Check if strings start with a pattern",
+ "Check if strings start with a literal pattern",
("For each string in `strings`, emit true iff it starts with a given pattern.\n"
"Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
"If ignore_case is set, only simple case folding is performed."),
{"strings"}, "MatchSubstringOptions");
const FunctionDoc ends_with_doc(
- "Check if strings end with a pattern",
+ "Check if strings end with a literal pattern",
("For each string in `strings`, emit true iff it ends with a given pattern.\n"
"Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
"If ignore_case is set, only simple case folding is performed."),
From b92cd98f6cd33f2547c1de56002d5cc0c0125d57 Mon Sep 17 00:00:00 2001
From: David Li
Date: Mon, 7 Jun 2021 14:05:49 -0400
Subject: [PATCH 4/6] ARROW-12949: [C++] Add MatchLike benchmarks
---
.../kernels/scalar_string_benchmark.cc | 30 +++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc
index 8528c0d9e5d..606e774451c 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc
@@ -87,6 +87,30 @@ static void TrimManyAscii(benchmark::State& state) {
UnaryStringBenchmark(state, "ascii_trim", &options);
}
+#ifdef ARROW_WITH_RE2
+static void MatchLike(benchmark::State& state) {
+ MatchSubstringOptions options("ab%ac");
+ UnaryStringBenchmark(state, "match_like", &options);
+}
+
+// MatchLike optimizes the following three into a substring/prefix/suffix search instead
+// of using RE2
+static void MatchLikeSubstring(benchmark::State& state) {
+ MatchSubstringOptions options("%abac%");
+ UnaryStringBenchmark(state, "match_like", &options);
+}
+
+static void MatchLikePrefix(benchmark::State& state) {
+ MatchSubstringOptions options("abac%");
+ UnaryStringBenchmark(state, "match_like", &options);
+}
+
+static void MatchLikeSuffix(benchmark::State& state) {
+ MatchSubstringOptions options("%abac");
+ UnaryStringBenchmark(state, "match_like", &options);
+}
+#endif
+
#ifdef ARROW_WITH_UTF8PROC
static void Utf8Upper(benchmark::State& state) {
UnaryStringBenchmark(state, "utf8_upper");
@@ -152,6 +176,12 @@ BENCHMARK(MatchSubstring);
BENCHMARK(SplitPattern);
BENCHMARK(TrimSingleAscii);
BENCHMARK(TrimManyAscii);
+#ifdef ARROW_WITH_RE2
+BENCHMARK(MatchLike);
+BENCHMARK(MatchLikeSubstring);
+BENCHMARK(MatchLikePrefix);
+BENCHMARK(MatchLikeSuffix);
+#endif
#ifdef ARROW_WITH_UTF8PROC
BENCHMARK(Utf8Lower);
BENCHMARK(Utf8Upper);
From deefabdbc136cb2df5e6510a93705d8720cf9491 Mon Sep 17 00:00:00 2001
From: David Li
Date: Mon, 7 Jun 2021 16:59:46 -0400
Subject: [PATCH 5/6] ARROW-12949: [C++] Add docs for starts_with/ends_with
---
docs/source/cpp/compute.rst | 47 +++++++++++++++++-------------
docs/source/python/api/compute.rst | 10 ++++---
2 files changed, 33 insertions(+), 24 deletions(-)
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 434d4a23e9c..4aa38e1a295 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -563,52 +563,59 @@ Containment tests
+===========================+============+====================================+====================+========================================+
| count_substring | Unary | String-like | Int32 or Int64 (1) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
-| find_substring | Unary | String-like | Int32 or Int64 (2) | :struct:`MatchSubstringOptions` |
+| ends_with | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
-| match_like | Unary | String-like | Boolean (3) | :struct:`MatchSubstringOptions` |
+| find_substring | Unary | String-like | Int32 or Int64 (3) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
-| match_substring | Unary | String-like | Boolean (4) | :struct:`MatchSubstringOptions` |
-+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
-| match_substring_regex | Unary | String-like | Boolean (5) | :struct:`MatchSubstringOptions` |
-+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
-| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (6) | :struct:`SetLookupOptions` |
+| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (4) | :struct:`SetLookupOptions` |
| | | Binary- and String-like | | |
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
-| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (7) | :struct:`SetLookupOptions` |
+| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (5) | :struct:`SetLookupOptions` |
| | | Binary- and String-like | | |
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
+| match_like | Unary | String-like | Boolean (6) | :struct:`MatchSubstringOptions` |
++---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
+| match_substring | Unary | String-like | Boolean (7) | :struct:`MatchSubstringOptions` |
++---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
+| match_substring_regex | Unary | String-like | Boolean (8) | :struct:`MatchSubstringOptions` |
++---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
+| starts_with | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` |
++---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
+
* \(1) Output is the number of occurrences of
:member:`MatchSubstringOptions::pattern` in the corresponding input
string. Output type is Int32 for Binary/String, Int64
for LargeBinary/LargeString.
-* \(2) Output is the index of the first occurrence of
+* \(2) Output is true iff :member:`MatchSubstringOptions::pattern`
+ is a suffix (``ends_with``) or a prefix (``starts_with``) of the corresponding input.
+
+* \(3) Output is the index of the first occurrence of
:member:`MatchSubstringOptions::pattern` in the corresponding input
string, otherwise -1. Output type is Int32 for Binary/String, Int64
for LargeBinary/LargeString.
-* \(3) Output is true iff the SQL-style LIKE pattern
+* \(4) Output is the index of the corresponding input element in
+ :member:`SetLookupOptions::value_set`, if found there. Otherwise,
+ output is null.
+
+* \(5) Output is true iff the corresponding input element is equal to one
+ of the elements in :member:`SetLookupOptions::value_set`.
+
+* \(6) Output is true iff the SQL-style LIKE pattern
:member:`MatchSubstringOptions::pattern` fully matches the
corresponding input element. That is, ``%`` will match any number of
characters, ``_`` will match exactly one character, and any other
character matches itself. To match a literal percent sign or
underscore, precede the character with a backslash.
-* \(4) Output is true iff :member:`MatchSubstringOptions::pattern`
+* \(7) Output is true iff :member:`MatchSubstringOptions::pattern`
is a substring of the corresponding input element.
-* \(5) Output is true iff :member:`MatchSubstringOptions::pattern`
+* \(8) Output is true iff :member:`MatchSubstringOptions::pattern`
matches the corresponding input element at any position.
-* \(6) Output is the index of the corresponding input element in
- :member:`SetLookupOptions::value_set`, if found there. Otherwise,
- output is null.
-
-* \(7) Output is true iff the corresponding input element is equal to one
- of the elements in :member:`SetLookupOptions::value_set`.
-
-
String splitting
~~~~~~~~~~~~~~~~
diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst
index a586f9011fd..1dbcb3073ca 100644
--- a/docs/source/python/api/compute.rst
+++ b/docs/source/python/api/compute.rst
@@ -40,7 +40,7 @@ Arithmetic Functions
--------------------
By default these functions do not detect overflow. Each function is also
-available in an overflow-checking variant, suffixed ``_checked``, which
+available in an overflow-checking variant, suffixed ``_checked``, which
throws an ``ArrowInvalid`` exception when overflow is detected.
.. autosummary::
@@ -104,11 +104,11 @@ logic variants are provided (suffixed ``_kleene``). See User Guide for details.
String Predicates
-----------------
-In these functions an empty string emits false in the output. For ASCII
+In these functions an empty string emits false in the output. For ASCII
variants (prefixed ``ascii_``) a string element with non-ASCII characters
emits false in the output.
-The first set of functions emit true if the input contains only
+The first set of functions emit true if the input contains only
characters of a given class.
.. autosummary::
@@ -140,7 +140,7 @@ in the string element.
ascii_is_title
utf8_is_title
-The third set of functions examines string elements on
+The third set of functions examines string elements on
a byte-by-byte basis.
.. autosummary::
@@ -179,12 +179,14 @@ Containment tests
:toctree: ../generated/
count_substring
+ ends_with
find_substring
index_in
is_in
match_like
match_substring
match_substring_regex
+ starts_with
Conversions
-----------
From 401f03e8e045484714953920e0de881de1c7da13 Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Tue, 8 Jun 2021 10:01:36 +0200
Subject: [PATCH 6/6] Avoid compiling regexes at startup
---
cpp/src/arrow/compute/kernels/scalar_string.cc | 15 ++++++++-------
.../arrow/compute/kernels/scalar_string_test.cc | 4 ++--
2 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index ca614246e97..a1e19b608d9 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -739,19 +739,20 @@ std::string MakeLikeRegex(const MatchSubstringOptions& options) {
return like_pattern;
}
-// A LIKE pattern matching this regex can be translated into a substring search.
-static RE2 kLikePatternIsSubstringMatch(R"(%+([^%_]*[^\\%_])?%+)");
-// A LIKE pattern matching this regex can be translated into a prefix search.
-static RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)");
-// A LIKE pattern matching this regex can be translated into a suffix search.
-static RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))");
-
// Evaluate a SQL-like LIKE pattern by translating it to a regexp or
// substring search as appropriate. See what Apache Impala does:
// https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc
template
struct MatchLike {
static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // NOTE: avoid making those constants global to avoid compiling regexes at startup
+ // A LIKE pattern matching this regex can be translated into a substring search.
+ static const RE2 kLikePatternIsSubstringMatch(R"(%+([^%_]*[^\\%_])?%+)");
+ // A LIKE pattern matching this regex can be translated into a prefix search.
+ static const RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)");
+ // A LIKE pattern matching this regex can be translated into a suffix search.
+ static const RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))");
+
auto original_options = MatchSubstringState::Get(ctx);
auto original_state = ctx->state();
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 6ec5db14d4f..f015e339423 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -465,7 +465,7 @@ TYPED_TEST(TestStringKernels, MatchEndsWith) {
#ifdef ARROW_WITH_RE2
TYPED_TEST(TestStringKernels, MatchStartsWithIgnoreCase) {
- MatchSubstringOptions options{"abab", /*ignore_case=*/true};
+ MatchSubstringOptions options{"aBAb", /*ignore_case=*/true};
this->CheckUnary("starts_with", "[]", boolean(), "[]", &options);
this->CheckUnary("starts_with", R"([null, "", "ab", "abab", "$abab", "abab$"])",
boolean(), "[null, false, false, true, false, true]", &options);
@@ -474,7 +474,7 @@ TYPED_TEST(TestStringKernels, MatchStartsWithIgnoreCase) {
}
TYPED_TEST(TestStringKernels, MatchEndsWithIgnoreCase) {
- MatchSubstringOptions options{"abab", /*ignore_case=*/true};
+ MatchSubstringOptions options{"aBAb", /*ignore_case=*/true};
this->CheckUnary("ends_with", "[]", boolean(), "[]", &options);
this->CheckUnary("ends_with", R"([null, "", "ab", "abab", "$abab", "abab$"])",
boolean(), "[null, false, false, true, true, false]", &options);