From a86744e38f71375fb5d933b02040d0fa737ec887 Mon Sep 17 00:00:00 2001
From: David Li
Date: Wed, 19 May 2021 09:23:36 -0400
Subject: [PATCH 1/5] ARROW-12715: [C++][Python] Add SQL LIKE match kernel
---
.../arrow/compute/kernels/scalar_string.cc | 97 +++++++++++++++++++
.../compute/kernels/scalar_string_test.cc | 49 ++++++++++
docs/source/cpp/compute.rst | 24 +++--
docs/source/python/api/compute.rst | 1 +
python/pyarrow/compute.py | 22 +++++
python/pyarrow/tests/test_compute.py | 7 ++
6 files changed, 192 insertions(+), 8 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 65196b2a491..398ee63c9f3 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -494,6 +494,93 @@ const FunctionDoc match_substring_regex_doc(
"position.\n"
"Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
{"strings"}, "MatchSubstringOptions");
+
+// SQL LIKE match
+
+/// Convert a SQL-style LIKE pattern (using '%' and '_') into a regex pattern anchored at both ends; a backslash makes the next character literal
+std::string MakeLikeRegex(const MatchSubstringOptions& options) {
+ // Allow . to match \n (the (?s: ... ) group), and anchor with ^ here and $ at the end
+ std::string like_pattern = "(?s:^";
+ like_pattern.reserve(options.pattern.size() + 7);  // 7 == length of the "(?s:^" prefix plus the "$)" suffix
+ bool escaped = false;  // true when the previous character was an unescaped backslash
+ for (const char c : options.pattern) {
+ if (!escaped && c == '%') {
+ like_pattern.append(".*");  // unescaped percent sign: any run of characters, possibly empty
+ } else if (!escaped && c == '_') {
+ like_pattern.append(".");  // unescaped underscore: exactly one character
+ } else if (!escaped && c == '\\') {
+ escaped = true;  // the backslash itself emits nothing; it only marks the next character as literal
+ } else {  // literal character, including any character that follows a backslash
+ switch (c) {
+ case '.':
+ case '?':
+ case '+':
+ case '*':
+ case '^':
+ case '$':
+ case '\\':
+ case '[':
+ case '{':
+ case '(':
+ case ')':
+ case '|': {
+ like_pattern.push_back('\\');  // regex metacharacter: emit it backslash-escaped
+ like_pattern.push_back(c);
+ escaped = false;
+ break;
+ }
+ default: {
+ like_pattern.push_back(c);  // everything else is copied through unchanged
+ escaped = false;
+ break;
+ }
+ }
+ }
+ }
+ like_pattern.append("$)");  // NOTE: a trailing unescaped backslash in the input has emitted nothing by this point
+ return like_pattern;
+}
+
+// A LIKE pattern matching this regex can be translated into a substring search.
+static RE2 kLikePatternIsSubstringMatch("%+([^%_])*%+");
+
+// Evaluate a SQL-like LIKE pattern by translating it to a regexp or
+// substring search as appropriate. See what Apache Impala does:
+// https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc
+// Note that Impala optimizes more cases (e.g. prefix match) but we
+// don't have kernels for those.
+template <typename StringType>
+struct MatchLike {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ auto original_options = MatchSubstringState::Get(ctx);
+ auto original_state = ctx->state();  // saved so it can be put back before returning
+
+ Status status;
+ std::string pattern;  // receives the capture group of kLikePatternIsSubstringMatch
+ if (re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch,
+ &pattern)) {
+ MatchSubstringOptions converted_options{pattern};  // wildcard-free text: plain substring search
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);  // temporarily point the context at the local converted state
+ status = MatchSubstring<StringType, PlainSubstringMatcher>::Exec(ctx, batch, out);
+ } else {
+ MatchSubstringOptions converted_options{MakeLikeRegex(original_options)};  // general case: anchored regex
+ MatchSubstringState converted_state(converted_options);
+ ctx->SetState(&converted_state);  // temporarily point the context at the local converted state
+ status = MatchSubstring<StringType, RegexSubstringMatcher>::Exec(ctx, batch, out);
+ }
+ ctx->SetState(original_state);  // converted_state is about to go out of scope; restore the caller's state
+ return status;
+ }
+};
+
+const FunctionDoc match_like_doc(
+ "Match strings against SQL-style LIKE pattern",
+ ("For each string in `strings`, emit true iff it fully matches a given pattern "
+ "at any position.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ {"strings"}, "MatchSubstringOptions");
+
#endif
void AddMatchSubstring(FunctionRegistry* registry) {
@@ -518,6 +605,16 @@ void AddMatchSubstring(FunctionRegistry* registry) {
func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
DCHECK_OK(registry->AddFunction(std::move(func)));
}
+ {
+ auto func =
+ std::make_shared<ScalarFunction>("match_like", Arity::Unary(), &match_like_doc);
+ auto exec_32 = MatchLike<StringType>::Exec;  // registered below for utf8() inputs
+ auto exec_64 = MatchLike<LargeStringType>::Exec;  // registered below for large_utf8() inputs
+ DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
+ DCHECK_OK(
+ func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
#endif
}
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index a59634b7be8..5f1df8aaafa 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -388,6 +388,55 @@ TYPED_TEST(TestStringKernels, MatchSubstringRegexInvalid) {
Invalid, ::testing::HasSubstr("Invalid regular expression: missing ]"),
CallFunction("match_substring_regex", {input}, &options));
}
+
+TYPED_TEST(TestStringKernels, MatchLike) {
+ auto inputs = R"(["foo", "bar", "foobar", "barfoo", "\nfoo", "foo\n", null])";
+
+ MatchSubstringOptions prefix_match{"foo%"};
+ this->CheckUnary("match_like", "[]", boolean(), "[]", &prefix_match);
+ this->CheckUnary("match_like", inputs, boolean(),
+ "[true, false, true, false, false, true, null]", &prefix_match);
+
+ MatchSubstringOptions suffix_match{"%foo"};
+ this->CheckUnary("match_like", inputs, boolean(),
+ "[true, false, false, true, true, false, null]", &suffix_match);
+
+ MatchSubstringOptions substring_match{"%foo%"};
+ this->CheckUnary("match_like", inputs, boolean(),
+ "[true, false, true, true, true, true, null]", &substring_match);
+
+ MatchSubstringOptions trivial_match{"%%"};
+ this->CheckUnary("match_like", inputs, boolean(),
+ "[true, true, true, true, true, true, null]", &trivial_match);
+
+ MatchSubstringOptions regex_match{"foo%bar"};
+ this->CheckUnary("match_like", inputs, boolean(),
+ "[false, false, true, false, false, false, null]", &regex_match);
+}
+
+TYPED_TEST(TestStringKernels, MatchLikeEscaping) {
+ auto inputs = R"(["%%foo", "_bar", "({", "\\baz"])";  // JSON text: the last element is a single backslash followed by "baz"
+
+ MatchSubstringOptions escape_percent{"\\%%"};  // LIKE pattern: escaped (literal) percent sign, then a wildcard percent
+ this->CheckUnary("match_like", inputs, boolean(), "[true, false, false, false]",
+ &escape_percent);
+
+ MatchSubstringOptions escape_underscore{"\\____"};  // LIKE pattern: literal underscore, then three single-character wildcards
+ this->CheckUnary("match_like", inputs, boolean(), "[false, true, false, false]",
+ &escape_underscore);
+
+ MatchSubstringOptions escape_regex{"(%"};  // '(' is a regex metacharacter but must match literally in LIKE
+ this->CheckUnary("match_like", inputs, boolean(), "[false, false, true, false]",
+ &escape_regex);
+
+ MatchSubstringOptions escape_escape{"\\\\%"};  // LIKE pattern: escaped (literal) backslash, then a wildcard percent
+ this->CheckUnary("match_like", inputs, boolean(), "[false, false, false, true]",
+ &escape_escape);
+
+ MatchSubstringOptions special_chars{"!@#$^&*()[]{}.?"};
+ this->CheckUnary("match_like", R"(["!@#$^&*()[]{}.?"])", boolean(), "[true]",
+ &special_chars);
+}
#endif
TYPED_TEST(TestStringKernels, SplitBasics) {
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 592dc4ec1b0..2f5ee747aeb 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -529,28 +529,36 @@ Containment tests
+---------------------------+------------+------------------------------------+---------------+----------------------------------------+
| Function name | Arity | Input types | Output type | Options class |
+===========================+============+====================================+===============+========================================+
-| match_substring | Unary | String-like | Boolean (1) | :struct:`MatchSubstringOptions` |
+| match_like | Unary | String-like | Boolean (1) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+---------------+----------------------------------------+
-| match_substring_regex | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` |
+| match_substring | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+---------------+----------------------------------------+
-| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (3) | :struct:`SetLookupOptions` |
+| match_substring_regex | Unary | String-like | Boolean (3) | :struct:`MatchSubstringOptions` |
++---------------------------+------------+------------------------------------+---------------+----------------------------------------+
+| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (4) | :struct:`SetLookupOptions` |
| | | Binary- and String-like | | |
+---------------------------+------------+------------------------------------+---------------+----------------------------------------+
-| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (4) | :struct:`SetLookupOptions` |
+| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (5) | :struct:`SetLookupOptions` |
| | | Binary- and String-like | | |
+---------------------------+------------+------------------------------------+---------------+----------------------------------------+
-* \(1) Output is true iff :member:`MatchSubstringOptions::pattern`
- is a substring of the corresponding input element.
+* \(1) Output is true iff the SQL-style LIKE pattern
+ :member:`MatchSubstringOptions::pattern` fully matches the
+ corresponding input element. That is, ``%`` will match any number of
+ characters, ``_`` will match exactly one character, and any other
+ character matches itself.
* \(2) Output is true iff :member:`MatchSubstringOptions::pattern`
+ is a substring of the corresponding input element.
+
+* \(3) Output is true iff :member:`MatchSubstringOptions::pattern`
matches the corresponding input element at any position.
-* \(3) Output is the index of the corresponding input element in
+* \(4) Output is the index of the corresponding input element in
:member:`SetLookupOptions::value_set`, if found there. Otherwise,
output is null.
-* \(4) Output is true iff the corresponding input element is equal to one
+* \(5) Output is true iff the corresponding input element is equal to one
of the elements in :member:`SetLookupOptions::value_set`.
diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst
index da16ccdfa29..7ab8338b522 100644
--- a/docs/source/python/api/compute.rst
+++ b/docs/source/python/api/compute.rst
@@ -156,6 +156,7 @@ Containment tests
index_in
is_in
+ match_like
match_substring
match_substring_regex
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index ec38710b023..79496b3f770 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -289,6 +289,28 @@ def cast(arr, target_type, safe=True):
return call_function("cast", [arr], options)
+def match_like(array, pattern):
+ """
+ Test if the SQL-style LIKE pattern *pattern* matches a value of a
+ string array.
+
+ Parameters
+ ----------
+ array : pyarrow.Array or pyarrow.ChunkedArray
+ pattern : str
+ SQL-style LIKE pattern. '%' will match any number of
+ characters, '_' will match exactly one character, and all
+ other characters match themselves.
+
+ Returns
+ -------
+ result : pyarrow.Array or pyarrow.ChunkedArray
+
+ """
+ return call_function("match_like", [array],
+ MatchSubstringOptions(pattern))
+
+
def match_substring(array, pattern):
"""
Test if substring *pattern* is contained within a value of a string array.
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 8e045fb4f2d..fc87b2b4a19 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -272,6 +272,13 @@ def test_variance():
assert pc.variance(data, ddof=1).as_py() == 6.0
+def test_match_like():
+ arr = pa.array(["ab", "ba%", "ba", "ca%d", None])
+ result = pc.match_like(arr, r"_a\%%")
+ expected = pa.array([False, True, False, True, None])
+ assert expected.equals(result)
+
+
def test_match_substring():
arr = pa.array(["ab", "abc", "ba", None])
result = pc.match_substring(arr, "ab")
From 38f26d8595b57ef2898e10d82567b346bf52c39b Mon Sep 17 00:00:00 2001
From: David Li
Date: Wed, 19 May 2021 17:01:12 -0400
Subject: [PATCH 2/5] ARROW-12715: [C++][Python] Add test case for escape
sequences
---
cpp/src/arrow/compute/kernels/scalar_string_test.cc | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 5f1df8aaafa..743ff50feb3 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -436,6 +436,10 @@ TYPED_TEST(TestStringKernels, MatchLikeEscaping) {
MatchSubstringOptions special_chars{"!@#$^&*()[]{}.?"};
this->CheckUnary("match_like", R"(["!@#$^&*()[]{}.?"])", boolean(), "[true]",
&special_chars);
+
+ MatchSubstringOptions escape_sequences{"\n\t%"};
+ this->CheckUnary("match_like", R"(["\n\tfoo\t", "\n\t", "\n"])", boolean(),
+ "[true, true, false]", &escape_sequences);
}
#endif
From 7f3bdfd7752eb73ff9d3b2b8eb583cd4ad4a849d Mon Sep 17 00:00:00 2001
From: David Li
Date: Wed, 19 May 2021 17:30:59 -0400
Subject: [PATCH 3/5] ARROW-12715: [C++][Python] Document escape character
---
cpp/src/arrow/compute/kernels/scalar_string.cc | 7 +++++--
docs/source/cpp/compute.rst | 3 ++-
python/pyarrow/compute.py | 3 ++-
3 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 398ee63c9f3..fafa1e554a9 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -577,8 +577,11 @@ struct MatchLike {
const FunctionDoc match_like_doc(
"Match strings against SQL-style LIKE pattern",
("For each string in `strings`, emit true iff it fully matches a given pattern "
- "at any position.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ "at any position. That is, '%' will match any number of characters, '_' will "
+ "match exactly one character, and any other character matches itself. To "
+ "match a literal '%', '_', or '\\', precede the character with a backslash.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions.\n"
+ "To match a literal"),
{"strings"}, "MatchSubstringOptions");
#endif
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 2f5ee747aeb..e9b3feb5b56 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -546,7 +546,8 @@ Containment tests
:member:`MatchSubstringOptions::pattern` fully matches the
corresponding input element. That is, ``%`` will match any number of
characters, ``_`` will match exactly one character, and any other
- character matches itself.
+ character matches itself. To match a literal percent sign or
+ underscore, precede the character with a backslash.
* \(2) Output is true iff :member:`MatchSubstringOptions::pattern`
is a substring of the corresponding input element.
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index 79496b3f770..18d7fee8df0 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -300,7 +300,8 @@ def match_like(array, pattern):
pattern : str
SQL-style LIKE pattern. '%' will match any number of
characters, '_' will match exactly one character, and all
- other characters match themselves.
+ other characters match themselves. To match a literal percent
+ sign or underscore, precede the character with a backslash.
Returns
-------
From f9605d4513dd74a2219f9bed40597c69bdac6485 Mon Sep 17 00:00:00 2001
From: David Li
Date: Wed, 19 May 2021 17:39:32 -0400
Subject: [PATCH 4/5] ARROW-12715: [C++] Fix docstring
---
cpp/src/arrow/compute/kernels/scalar_string.cc | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index fafa1e554a9..44db3fb5e40 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -580,8 +580,7 @@ const FunctionDoc match_like_doc(
"at any position. That is, '%' will match any number of characters, '_' will "
"match exactly one character, and any other character matches itself. To "
"match a literal '%', '_', or '\\', precede the character with a backslash.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions.\n"
- "To match a literal"),
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
{"strings"}, "MatchSubstringOptions");
#endif
From 9ccca34b3f4129fd1f7945c32d40d7edffa2ff6c Mon Sep 17 00:00:00 2001
From: David Li
Date: Wed, 19 May 2021 18:00:21 -0400
Subject: [PATCH 5/5] ARROW-12715: [C++] Fix substring match optimization
---
cpp/src/arrow/compute/kernels/scalar_string.cc | 2 +-
.../arrow/compute/kernels/scalar_string_test.cc | 14 ++++++++------
2 files changed, 9 insertions(+), 7 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 44db3fb5e40..1475379391e 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -542,7 +542,7 @@ std::string MakeLikeRegex(const MatchSubstringOptions& options) {
}
// A LIKE pattern matching this regex can be translated into a substring search.
-static RE2 kLikePatternIsSubstringMatch("%+([^%_])*%+");
+static RE2 kLikePatternIsSubstringMatch("%+([^%_]*)%+");
// Evaluate a SQL-like LIKE pattern by translating it to a regexp or
// substring search as appropriate. See what Apache Impala does:
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 743ff50feb3..c20af503ca9 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -390,28 +390,30 @@ TYPED_TEST(TestStringKernels, MatchSubstringRegexInvalid) {
}
TYPED_TEST(TestStringKernels, MatchLike) {
- auto inputs = R"(["foo", "bar", "foobar", "barfoo", "\nfoo", "foo\n", null])";
+ auto inputs = R"(["foo", "bar", "foobar", "barfoo", "o", "\nfoo", "foo\n", null])";
MatchSubstringOptions prefix_match{"foo%"};
this->CheckUnary("match_like", "[]", boolean(), "[]", &prefix_match);
this->CheckUnary("match_like", inputs, boolean(),
- "[true, false, true, false, false, true, null]", &prefix_match);
+ "[true, false, true, false, false, false, true, null]", &prefix_match);
MatchSubstringOptions suffix_match{"%foo"};
this->CheckUnary("match_like", inputs, boolean(),
- "[true, false, false, true, true, false, null]", &suffix_match);
+ "[true, false, false, true, false, true, false, null]", &suffix_match);
MatchSubstringOptions substring_match{"%foo%"};
this->CheckUnary("match_like", inputs, boolean(),
- "[true, false, true, true, true, true, null]", &substring_match);
+ "[true, false, true, true, false, true, true, null]",
+ &substring_match);
MatchSubstringOptions trivial_match{"%%"};
this->CheckUnary("match_like", inputs, boolean(),
- "[true, true, true, true, true, true, null]", &trivial_match);
+ "[true, true, true, true, true, true, true, null]", &trivial_match);
MatchSubstringOptions regex_match{"foo%bar"};
this->CheckUnary("match_like", inputs, boolean(),
- "[false, false, true, false, false, false, null]", &regex_match);
+ "[false, false, true, false, false, false, false, null]",
+ &regex_match);
}
TYPED_TEST(TestStringKernels, MatchLikeEscaping) {