Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,95 @@ const FunctionDoc match_substring_regex_doc(
"position.\n"
"Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
{"strings"}, "MatchSubstringOptions");

// SQL LIKE match

/// Translate a SQL LIKE pattern into an anchored regular expression.
///
/// The pattern is read from `options.pattern`:
///  - an unescaped '%' becomes ".*"
///  - an unescaped '_' becomes "."
///  - an unescaped backslash emits nothing and makes the next character literal
///  - every other character is emitted literally, preceded by a backslash when
///    it is one of the regex metacharacters listed below
///
/// The result is wrapped as "(?s:^" ... "$)". A backslash that is the last
/// character of the pattern has no following character to escape and so
/// contributes nothing to the output.
std::string MakeLikeRegex(const MatchSubstringOptions& options) {
  // True for the characters that get a leading backslash in the output.
  const auto is_regex_metachar = [](const char ch) {
    constexpr char kMetachars[] = {'.', '?',  '+', '*', '^', '$',
                                   '\\', '[', '{', '(', ')', '|'};
    for (const char meta : kMetachars) {
      if (ch == meta) return true;
    }
    return false;
  };

  // The (?s) flag allows . to match \n
  std::string regex = "(?s:^";
  // 7 == length of the "(?s:^" prefix plus the "$)" suffix.
  regex.reserve(options.pattern.size() + 7);

  // Emit `ch` so that the regex engine treats it as a plain character.
  const auto append_literal = [&](const char ch) {
    if (is_regex_metachar(ch)) {
      regex.push_back('\\');
    }
    regex.push_back(ch);
  };

  bool after_backslash = false;
  for (const char ch : options.pattern) {
    if (after_backslash) {
      // Whatever follows an escaping backslash loses its LIKE meaning.
      append_literal(ch);
      after_backslash = false;
      continue;
    }
    switch (ch) {
      case '%':
        regex.append(".*");
        break;
      case '_':
        regex.append(".");
        break;
      case '\\':
        after_backslash = true;
        break;
      default:
        append_literal(ch);
        break;
    }
  }

  regex.append("$)");
  return regex;
}

// A LIKE pattern matching this regex can be translated into a substring search:
// one or more '%', then a run of ordinary characters (captured), then one or
// more '%'.
// Backslashes are deliberately excluded from the captured run. A backslash is
// the LIKE escape character, so a pattern such as "%foo\%" means "ends with
// the text foo%"; capturing "foo\" and searching for it verbatim would be
// wrong. Patterns containing a backslash therefore do not match here.
static RE2 kLikePatternIsSubstringMatch("%+([^%_\\\\]*)%+");

// Evaluate a SQL-style LIKE pattern by rewriting it into either a plain
// substring search or a regular expression, whichever applies. Modelled on
// what Apache Impala does:
// https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc
// Impala special-cases more pattern shapes (e.g. prefix match), but we
// don't have kernels for those.
template <typename StringType>
struct MatchLike {
  // Runs the kernel with a temporary, rewritten MatchSubstringState installed
  // on `ctx`, then puts the caller's state back before returning the status
  // of the delegated MatchSubstring kernel.
  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
    auto like_options = MatchSubstringState::Get(ctx);
    auto saved_state = ctx->state();

    // When the LIKE pattern has the substring shape, `literal` receives the
    // text to search for.
    std::string literal;
    const bool is_substring_search = re2::RE2::FullMatch(
        like_options.pattern, kLikePatternIsSubstringMatch, &literal);

    MatchSubstringOptions rewritten_options{
        is_substring_search ? literal : MakeLikeRegex(like_options)};
    // The rewritten state lives on this stack frame, so it must be uninstalled
    // (below) before this function returns.
    MatchSubstringState rewritten_state(rewritten_options);
    ctx->SetState(&rewritten_state);

    Status status =
        is_substring_search
            ? MatchSubstring<StringType, PlainSubstringMatcher>::Exec(ctx, batch, out)
            : MatchSubstring<StringType, RegexSubstringMatcher>::Exec(ctx, batch, out);

    ctx->SetState(saved_state);
    return status;
  }
};

// User-facing documentation for the "match_like" compute function.
// A LIKE pattern has to match the whole input string, so the description
// says "fully matches" without the "at any position" wording.
const FunctionDoc match_like_doc(
    "Match strings against SQL-style LIKE pattern",
    ("For each string in `strings`, emit true iff it fully matches a given pattern. "
     "That is, '%' will match any number of characters, '_' will "
     "match exactly one character, and any other character matches itself. To "
     "match a literal '%', '_', or '\\', precede the character with a backslash.\n"
     "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
    {"strings"}, "MatchSubstringOptions");

#endif

void AddMatchSubstring(FunctionRegistry* registry) {
Expand All @@ -518,6 +607,16 @@ void AddMatchSubstring(FunctionRegistry* registry) {
func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
DCHECK_OK(registry->AddFunction(std::move(func)));
}
{
  // Register the unary "match_like" function with one kernel per string
  // offset width (utf8 and large_utf8), both producing booleans and both
  // initialised through MatchSubstringState::Init.
  auto like_func =
      std::make_shared<ScalarFunction>("match_like", Arity::Unary(), &match_like_doc);
  DCHECK_OK(like_func->AddKernel({utf8()}, boolean(), MatchLike<StringType>::Exec,
                                 MatchSubstringState::Init));
  DCHECK_OK(like_func->AddKernel({large_utf8()}, boolean(),
                                 MatchLike<LargeStringType>::Exec,
                                 MatchSubstringState::Init));
  DCHECK_OK(registry->AddFunction(std::move(like_func)));
}
#endif
}

Expand Down
55 changes: 55 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,61 @@ TYPED_TEST(TestStringKernels, MatchSubstringRegexInvalid) {
Invalid, ::testing::HasSubstr("Invalid regular expression: missing ]"),
CallFunction("match_substring_regex", {input}, &options));
}

// Exercises "match_like" with patterns that use only the '%' wildcard:
// prefix, suffix, substring, match-everything and an infix wildcard.
TYPED_TEST(TestStringKernels, MatchLike) {
  auto inputs = R"(["foo", "bar", "foobar", "barfoo", "o", "\nfoo", "foo\n", null])";

  // Runs `like_pattern` over the shared inputs and compares with `expected`.
  auto check_like = [&](const char* like_pattern, const char* expected) {
    MatchSubstringOptions options{like_pattern};
    this->CheckUnary("match_like", inputs, boolean(), expected, &options);
  };

  // Empty input yields empty output.
  {
    MatchSubstringOptions options{"foo%"};
    this->CheckUnary("match_like", "[]", boolean(), "[]", &options);
  }

  // Prefix match
  check_like("foo%", "[true, false, true, false, false, false, true, null]");
  // Suffix match
  check_like("%foo", "[true, false, false, true, false, true, false, null]");
  // Substring match
  check_like("%foo%", "[true, false, true, true, false, true, true, null]");
  // Only wildcards: every non-null input matches
  check_like("%%", "[true, true, true, true, true, true, true, null]");
  // Wildcard in the middle
  check_like("foo%bar", "[false, false, true, false, false, false, false, null]");
}

// Exercises "match_like" with backslash escapes, regex metacharacters that
// must be treated literally, and control characters in the pattern.
TYPED_TEST(TestStringKernels, MatchLikeEscaping) {
  // Runs `like_pattern` over `json_input` and compares with `expected`.
  auto check_like = [&](const char* like_pattern, const char* json_input,
                        const char* expected) {
    MatchSubstringOptions options{like_pattern};
    this->CheckUnary("match_like", json_input, boolean(), expected, &options);
  };

  auto inputs = R"(["%%foo", "_bar", "({", "\\baz"])";

  // Escaped '%' followed by a wildcard
  check_like("\\%%", inputs, "[true, false, false, false]");
  // Escaped '_' followed by three single-character wildcards
  check_like("\\____", inputs, "[false, true, false, false]");
  // '(' is a regex metacharacter but an ordinary LIKE character
  check_like("(%", inputs, "[false, false, true, false]");
  // Escaped backslash followed by a wildcard
  check_like("\\\\%", inputs, "[false, false, false, true]");

  // A pattern made only of punctuation matches itself.
  check_like("!@#$^&*()[]{}.?", R"(["!@#$^&*()[]{}.?"])", "[true]");

  // Newline and tab in the pattern are matched literally.
  check_like("\n\t%", R"(["\n\tfoo\t", "\n\t", "\n"])", "[true, true, false]");
}
#endif

TYPED_TEST(TestStringKernels, SplitBasics) {
Expand Down
25 changes: 17 additions & 8 deletions docs/source/cpp/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -529,28 +529,37 @@ Containment tests
+---------------------------+------------+------------------------------------+---------------+----------------------------------------+
| Function name | Arity | Input types | Output type | Options class |
+===========================+============+====================================+===============+========================================+
| match_substring | Unary | String-like | Boolean (1) | :struct:`MatchSubstringOptions` |
| match_like | Unary | String-like | Boolean (1) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+---------------+----------------------------------------+
| match_substring_regex | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` |
| match_substring | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+---------------+----------------------------------------+
| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (3) | :struct:`SetLookupOptions` |
| match_substring_regex | Unary | String-like | Boolean (3) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+---------------+----------------------------------------+
| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (4) | :struct:`SetLookupOptions` |
| | | Binary- and String-like | | |
+---------------------------+------------+------------------------------------+---------------+----------------------------------------+
| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (4) | :struct:`SetLookupOptions` |
| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (5) | :struct:`SetLookupOptions` |
| | | Binary- and String-like | | |
+---------------------------+------------+------------------------------------+---------------+----------------------------------------+

* \(1) Output is true iff :member:`MatchSubstringOptions::pattern`
is a substring of the corresponding input element.
* \(1) Output is true iff the SQL-style LIKE pattern
:member:`MatchSubstringOptions::pattern` fully matches the
corresponding input element. That is, ``%`` will match any number of
characters, ``_`` will match exactly one character, and any other
character matches itself. To match a literal percent sign,
underscore, or backslash, precede the character with a backslash.

* \(2) Output is true iff :member:`MatchSubstringOptions::pattern`
is a substring of the corresponding input element.

* \(3) Output is true iff :member:`MatchSubstringOptions::pattern`
matches the corresponding input element at any position.

* \(3) Output is the index of the corresponding input element in
* \(4) Output is the index of the corresponding input element in
:member:`SetLookupOptions::value_set`, if found there. Otherwise,
output is null.

* \(4) Output is true iff the corresponding input element is equal to one
* \(5) Output is true iff the corresponding input element is equal to one
of the elements in :member:`SetLookupOptions::value_set`.


Expand Down
1 change: 1 addition & 0 deletions docs/source/python/api/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ Containment tests

index_in
is_in
match_like
match_substring
match_substring_regex

Expand Down
23 changes: 23 additions & 0 deletions python/pyarrow/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,29 @@ def cast(arr, target_type, safe=True):
return call_function("cast", [arr], options)


def match_like(array, pattern):
    """
    Match each value of a string array against the SQL-style LIKE
    pattern *pattern*.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        SQL-style LIKE pattern. '%' matches any number of characters
        and '_' matches exactly one character; every other character
        matches itself. Precede a percent sign, underscore or
        backslash with a backslash to match it literally.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray

    """
    # The LIKE pattern travels in the same options class that the
    # substring matching functions use.
    options = MatchSubstringOptions(pattern)
    return call_function("match_like", [array], options)


def match_substring(array, pattern):
"""
Test if substring *pattern* is contained within a value of a string array.
Expand Down
7 changes: 7 additions & 0 deletions python/pyarrow/tests/test_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,13 @@ def test_variance():
assert pc.variance(data, ddof=1).as_py() == 6.0


def test_match_like():
    # Pattern: any one character, then "a", then a literal "%", then
    # anything. Nulls stay null.
    values = ["ab", "ba%", "ba", "ca%d", None]
    matches = [False, True, False, True, None]

    result = pc.match_like(pa.array(values), r"_a\%%")

    assert pa.array(matches).equals(result)


def test_match_substring():
arr = pa.array(["ab", "abc", "ba", None])
result = pc.match_substring(arr, "ab")
Expand Down