From a86744e38f71375fb5d933b02040d0fa737ec887 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 19 May 2021 09:23:36 -0400 Subject: [PATCH 1/5] ARROW-12715: [C++][Python] Add SQL LIKE match kernel --- .../arrow/compute/kernels/scalar_string.cc | 97 +++++++++++++++++++ .../compute/kernels/scalar_string_test.cc | 49 ++++++++++ docs/source/cpp/compute.rst | 24 +++-- docs/source/python/api/compute.rst | 1 + python/pyarrow/compute.py | 22 +++++ python/pyarrow/tests/test_compute.py | 7 ++ 6 files changed, 192 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 65196b2a491..398ee63c9f3 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -494,6 +494,93 @@ const FunctionDoc match_substring_regex_doc( "position.\n" "Null inputs emit null. The pattern must be given in MatchSubstringOptions."), {"strings"}, "MatchSubstringOptions"); + +// SQL LIKE match + +/// Convert a SQL-style LIKE pattern (using '%' and '_') into a regex pattern +std::string MakeLikeRegex(const MatchSubstringOptions& options) { + // Allow . 
to match \n + std::string like_pattern = "(?s:^"; + like_pattern.reserve(options.pattern.size() + 7); + bool escaped = false; + for (const char c : options.pattern) { + if (!escaped && c == '%') { + like_pattern.append(".*"); + } else if (!escaped && c == '_') { + like_pattern.append("."); + } else if (!escaped && c == '\\') { + escaped = true; + } else { + switch (c) { + case '.': + case '?': + case '+': + case '*': + case '^': + case '$': + case '\\': + case '[': + case '{': + case '(': + case ')': + case '|': { + like_pattern.push_back('\\'); + like_pattern.push_back(c); + escaped = false; + break; + } + default: { + like_pattern.push_back(c); + escaped = false; + break; + } + } + } + } + like_pattern.append("$)"); + return like_pattern; +} + +// A LIKE pattern matching this regex can be translated into a substring search. +static RE2 kLikePatternIsSubstringMatch("%+([^%_])*%+"); + +// Evaluate a SQL-like LIKE pattern by translating it to a regexp or +// substring search as appropriate. See what Apache Impala does: +// https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc +// Note that Impala optimizes more cases (e.g. prefix match) but we +// don't have kernels for those. 
+template +struct MatchLike { + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + auto original_options = MatchSubstringState::Get(ctx); + auto original_state = ctx->state(); + + Status status; + std::string pattern; + if (re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch, + &pattern)) { + MatchSubstringOptions converted_options{pattern}; + MatchSubstringState converted_state(converted_options); + ctx->SetState(&converted_state); + status = MatchSubstring::Exec(ctx, batch, out); + } else { + MatchSubstringOptions converted_options{MakeLikeRegex(original_options)}; + MatchSubstringState converted_state(converted_options); + ctx->SetState(&converted_state); + status = MatchSubstring::Exec(ctx, batch, out); + } + ctx->SetState(original_state); + return status; + } +}; + +const FunctionDoc match_like_doc( + "Match strings against SQL-style LIKE pattern", + ("For each string in `strings`, emit true iff it fully matches a given pattern " + "at any position.\n" + "Null inputs emit null. 
The pattern must be given in MatchSubstringOptions."), + {"strings"}, "MatchSubstringOptions"); + #endif void AddMatchSubstring(FunctionRegistry* registry) { @@ -518,6 +605,16 @@ void AddMatchSubstring(FunctionRegistry* registry) { func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init)); DCHECK_OK(registry->AddFunction(std::move(func))); } + { + auto func = + std::make_shared("match_like", Arity::Unary(), &match_like_doc); + auto exec_32 = MatchLike::Exec; + auto exec_64 = MatchLike::Exec; + DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init)); + DCHECK_OK( + func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init)); + DCHECK_OK(registry->AddFunction(std::move(func))); + } #endif } diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index a59634b7be8..5f1df8aaafa 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -388,6 +388,55 @@ TYPED_TEST(TestStringKernels, MatchSubstringRegexInvalid) { Invalid, ::testing::HasSubstr("Invalid regular expression: missing ]"), CallFunction("match_substring_regex", {input}, &options)); } + +TYPED_TEST(TestStringKernels, MatchLike) { + auto inputs = R"(["foo", "bar", "foobar", "barfoo", "\nfoo", "foo\n", null])"; + + MatchSubstringOptions prefix_match{"foo%"}; + this->CheckUnary("match_like", "[]", boolean(), "[]", &prefix_match); + this->CheckUnary("match_like", inputs, boolean(), + "[true, false, true, false, false, true, null]", &prefix_match); + + MatchSubstringOptions suffix_match{"%foo"}; + this->CheckUnary("match_like", inputs, boolean(), + "[true, false, false, true, true, false, null]", &suffix_match); + + MatchSubstringOptions substring_match{"%foo%"}; + this->CheckUnary("match_like", inputs, boolean(), + "[true, false, true, true, true, true, null]", &substring_match); + + MatchSubstringOptions 
trivial_match{"%%"}; + this->CheckUnary("match_like", inputs, boolean(), + "[true, true, true, true, true, true, null]", &trivial_match); + + MatchSubstringOptions regex_match{"foo%bar"}; + this->CheckUnary("match_like", inputs, boolean(), + "[false, false, true, false, false, false, null]", &regex_match); +} + +TYPED_TEST(TestStringKernels, MatchLikeEscaping) { + auto inputs = R"(["%%foo", "_bar", "({", "\\baz"])"; + + MatchSubstringOptions escape_percent{"\\%%"}; + this->CheckUnary("match_like", inputs, boolean(), "[true, false, false, false]", + &escape_percent); + + MatchSubstringOptions escape_underscore{"\\____"}; + this->CheckUnary("match_like", inputs, boolean(), "[false, true, false, false]", + &escape_underscore); + + MatchSubstringOptions escape_regex{"(%"}; + this->CheckUnary("match_like", inputs, boolean(), "[false, false, true, false]", + &escape_regex); + + MatchSubstringOptions escape_escape{"\\\\%"}; + this->CheckUnary("match_like", inputs, boolean(), "[false, false, false, true]", + &escape_escape); + + MatchSubstringOptions special_chars{"!@#$^&*()[]{}.?"}; + this->CheckUnary("match_like", R"(["!@#$^&*()[]{}.?"])", boolean(), "[true]", + &special_chars); +} #endif TYPED_TEST(TestStringKernels, SplitBasics) { diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 592dc4ec1b0..2f5ee747aeb 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -529,28 +529,36 @@ Containment tests +---------------------------+------------+------------------------------------+---------------+----------------------------------------+ | Function name | Arity | Input types | Output type | Options class | +===========================+============+====================================+===============+========================================+ -| match_substring | Unary | String-like | Boolean (1) | :struct:`MatchSubstringOptions` | +| match_like | Unary | String-like | Boolean (1) | :struct:`MatchSubstringOptions` | 
+---------------------------+------------+------------------------------------+---------------+----------------------------------------+ -| match_substring_regex | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` | +| match_substring | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` | +---------------------------+------------+------------------------------------+---------------+----------------------------------------+ -| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (3) | :struct:`SetLookupOptions` | +| match_substring_regex | Unary | String-like | Boolean (3) | :struct:`MatchSubstringOptions` | ++---------------------------+------------+------------------------------------+---------------+----------------------------------------+ +| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (4) | :struct:`SetLookupOptions` | | | | Binary- and String-like | | | +---------------------------+------------+------------------------------------+---------------+----------------------------------------+ -| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (4) | :struct:`SetLookupOptions` | +| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (5) | :struct:`SetLookupOptions` | | | | Binary- and String-like | | | +---------------------------+------------+------------------------------------+---------------+----------------------------------------+ -* \(1) Output is true iff :member:`MatchSubstringOptions::pattern` - is a substring of the corresponding input element. +* \(1) Output is true iff the SQL-style LIKE pattern + :member:`MatchSubstringOptions::pattern` fully matches the + corresponding input element. That is, ``%`` will match any number of + characters, ``_`` will match exactly one character, and any other + character matches itself. * \(2) Output is true iff :member:`MatchSubstringOptions::pattern` + is a substring of the corresponding input element. 
+ +* \(3) Output is true iff :member:`MatchSubstringOptions::pattern` matches the corresponding input element at any position. -* \(3) Output is the index of the corresponding input element in +* \(4) Output is the index of the corresponding input element in :member:`SetLookupOptions::value_set`, if found there. Otherwise, output is null. -* \(4) Output is true iff the corresponding input element is equal to one +* \(5) Output is true iff the corresponding input element is equal to one of the elements in :member:`SetLookupOptions::value_set`. diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index da16ccdfa29..7ab8338b522 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -156,6 +156,7 @@ Containment tests index_in is_in + match_like match_substring match_substring_regex diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index ec38710b023..79496b3f770 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -289,6 +289,28 @@ def cast(arr, target_type, safe=True): return call_function("cast", [arr], options) +def match_like(array, pattern): + """ + Test if the SQL-style LIKE pattern *pattern* matches a value of a + string array. + + Parameters + ---------- + array : pyarrow.Array or pyarrow.ChunkedArray + pattern : str + SQL-style LIKE pattern. '%' will match any number of + characters, '_' will match exactly one character, and all + other characters match themselves. + + Returns + ------- + result : pyarrow.Array or pyarrow.ChunkedArray + + """ + return call_function("match_like", [array], + MatchSubstringOptions(pattern)) + + def match_substring(array, pattern): """ Test if substring *pattern* is contained within a value of a string array. 
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 8e045fb4f2d..fc87b2b4a19 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -272,6 +272,13 @@ def test_variance(): assert pc.variance(data, ddof=1).as_py() == 6.0 +def test_match_like(): + arr = pa.array(["ab", "ba%", "ba", "ca%d", None]) + result = pc.match_like(arr, r"_a\%%") + expected = pa.array([False, True, False, True, None]) + assert expected.equals(result) + + def test_match_substring(): arr = pa.array(["ab", "abc", "ba", None]) result = pc.match_substring(arr, "ab") From 38f26d8595b57ef2898e10d82567b346bf52c39b Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 19 May 2021 17:01:12 -0400 Subject: [PATCH 2/5] ARROW-12715: [C++][Python] Add test case for escape sequences --- cpp/src/arrow/compute/kernels/scalar_string_test.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 5f1df8aaafa..743ff50feb3 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -436,6 +436,10 @@ TYPED_TEST(TestStringKernels, MatchLikeEscaping) { MatchSubstringOptions special_chars{"!@#$^&*()[]{}.?"}; this->CheckUnary("match_like", R"(["!@#$^&*()[]{}.?"])", boolean(), "[true]", &special_chars); + + MatchSubstringOptions escape_sequences{"\n\t%"}; + this->CheckUnary("match_like", R"(["\n\tfoo\t", "\n\t", "\n"])", boolean(), + "[true, true, false]", &escape_sequences); } #endif From 7f3bdfd7752eb73ff9d3b2b8eb583cd4ad4a849d Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 19 May 2021 17:30:59 -0400 Subject: [PATCH 3/5] ARROW-12715: [C++][Python] Document escape character --- cpp/src/arrow/compute/kernels/scalar_string.cc | 7 +++++-- docs/source/cpp/compute.rst | 3 ++- python/pyarrow/compute.py | 3 ++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git 
a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 398ee63c9f3..fafa1e554a9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -577,8 +577,11 @@ struct MatchLike { const FunctionDoc match_like_doc( "Match strings against SQL-style LIKE pattern", ("For each string in `strings`, emit true iff it fully matches a given pattern " - "at any position.\n" - "Null inputs emit null. The pattern must be given in MatchSubstringOptions."), + "at any position. That is, '%' will match any number of characters, '_' will " + "match exactly one character, and any other character matches itself. To " + "match a literal '%', '_', or '\\', precede the character with a backslash.\n" + "Null inputs emit null. The pattern must be given in MatchSubstringOptions.\n" + "To match a literal"), {"strings"}, "MatchSubstringOptions"); #endif diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 2f5ee747aeb..e9b3feb5b56 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -546,7 +546,8 @@ Containment tests :member:`MatchSubstringOptions::pattern` fully matches the corresponding input element. That is, ``%`` will match any number of characters, ``_`` will match exactly one character, and any other - character matches itself. + character matches itself. To match a literal percent sign or + underscore, precede the character with a backslash. * \(2) Output is true iff :member:`MatchSubstringOptions::pattern` is a substring of the corresponding input element. diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 79496b3f770..18d7fee8df0 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -300,7 +300,8 @@ def match_like(array, pattern): pattern : str SQL-style LIKE pattern. '%' will match any number of characters, '_' will match exactly one character, and all - other characters match themselves. 
+ other characters match themselves. To match a literal percent + sign or underscore, precede the character with a backslash. Returns ------- From f9605d4513dd74a2219f9bed40597c69bdac6485 Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 19 May 2021 17:39:32 -0400 Subject: [PATCH 4/5] ARROW-12715: [C++] Fix docstring --- cpp/src/arrow/compute/kernels/scalar_string.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index fafa1e554a9..44db3fb5e40 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -580,8 +580,7 @@ const FunctionDoc match_like_doc( "at any position. That is, '%' will match any number of characters, '_' will " "match exactly one character, and any other character matches itself. To " "match a literal '%', '_', or '\\', precede the character with a backslash.\n" - "Null inputs emit null. The pattern must be given in MatchSubstringOptions.\n" - "To match a literal"), + "Null inputs emit null. The pattern must be given in MatchSubstringOptions."), {"strings"}, "MatchSubstringOptions"); #endif From 9ccca34b3f4129fd1f7945c32d40d7edffa2ff6c Mon Sep 17 00:00:00 2001 From: David Li Date: Wed, 19 May 2021 18:00:21 -0400 Subject: [PATCH 5/5] ARROW-12715: [C++] Fix substring match optimization --- cpp/src/arrow/compute/kernels/scalar_string.cc | 2 +- .../arrow/compute/kernels/scalar_string_test.cc | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 44db3fb5e40..1475379391e 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -542,7 +542,7 @@ std::string MakeLikeRegex(const MatchSubstringOptions& options) { } // A LIKE pattern matching this regex can be translated into a substring search. 
-static RE2 kLikePatternIsSubstringMatch("%+([^%_])*%+"); +static RE2 kLikePatternIsSubstringMatch("%+([^%_]*)%+"); // Evaluate a SQL-like LIKE pattern by translating it to a regexp or // substring search as appropriate. See what Apache Impala does: diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 743ff50feb3..c20af503ca9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -390,28 +390,30 @@ TYPED_TEST(TestStringKernels, MatchSubstringRegexInvalid) { } TYPED_TEST(TestStringKernels, MatchLike) { - auto inputs = R"(["foo", "bar", "foobar", "barfoo", "\nfoo", "foo\n", null])"; + auto inputs = R"(["foo", "bar", "foobar", "barfoo", "o", "\nfoo", "foo\n", null])"; MatchSubstringOptions prefix_match{"foo%"}; this->CheckUnary("match_like", "[]", boolean(), "[]", &prefix_match); this->CheckUnary("match_like", inputs, boolean(), - "[true, false, true, false, false, true, null]", &prefix_match); + "[true, false, true, false, false, false, true, null]", &prefix_match); MatchSubstringOptions suffix_match{"%foo"}; this->CheckUnary("match_like", inputs, boolean(), - "[true, false, false, true, true, false, null]", &suffix_match); + "[true, false, false, true, false, true, false, null]", &suffix_match); MatchSubstringOptions substring_match{"%foo%"}; this->CheckUnary("match_like", inputs, boolean(), - "[true, false, true, true, true, true, null]", &substring_match); + "[true, false, true, true, false, true, true, null]", + &substring_match); MatchSubstringOptions trivial_match{"%%"}; this->CheckUnary("match_like", inputs, boolean(), - "[true, true, true, true, true, true, null]", &trivial_match); + "[true, true, true, true, true, true, true, null]", &trivial_match); MatchSubstringOptions regex_match{"foo%bar"}; this->CheckUnary("match_like", inputs, boolean(), - "[false, false, true, false, false, false, null]", &regex_match); + 
"[false, false, true, false, false, false, false, null]", + &regex_match); +} TYPED_TEST(TestStringKernels, MatchLikeEscaping) {