diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 65196b2a491..1475379391e 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -494,6 +494,101 @@ const FunctionDoc match_substring_regex_doc(
      "position.\n"
      "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
     {"strings"}, "MatchSubstringOptions");
+
+// SQL LIKE match
+
+/// Convert a SQL-style LIKE pattern (using '%' and '_') into a regex pattern
+///
+/// An unescaped '%' becomes ".*" and an unescaped '_' becomes "."; a backslash
+/// makes the character after it literal. Regex metacharacters are escaped and the
+/// result is wrapped in "(?s:^" ... "$)". A trailing lone backslash is dropped.
+std::string MakeLikeRegex(const MatchSubstringOptions& options) {
+  // (?s) allows . to match \n; ^ and $ anchor the match to the whole string
+  std::string like_pattern = "(?s:^";
+  like_pattern.reserve(options.pattern.size() + 7);  // room for "(?s:^" and "$)"
+  bool escaped = false;
+  for (const char c : options.pattern) {
+    if (!escaped && c == '%') {
+      like_pattern.append(".*");
+    } else if (!escaped && c == '_') {
+      like_pattern.append(".");
+    } else if (!escaped && c == '\\') {
+      escaped = true;
+    } else {
+      switch (c) {
+        case '.':
+        case '?':
+        case '+':
+        case '*':
+        case '^':
+        case '$':
+        case '\\':
+        case '[':
+        case '{':
+        case '(':
+        case ')':
+        case '|': {
+          like_pattern.push_back('\\');
+          like_pattern.push_back(c);
+          escaped = false;
+          break;
+        }
+        default: {
+          like_pattern.push_back(c);
+          escaped = false;
+          break;
+        }
+      }
+    }
+  }
+  like_pattern.append("$)");
+  return like_pattern;
+}
+
+// A LIKE pattern matching this regex can be translated into a substring search.
+// Backslashes are excluded from the captured text: a pattern using escapes (for
+// example "%\%%") must go through MakeLikeRegex() instead of being searched verbatim.
+static RE2 kLikePatternIsSubstringMatch(R"(%+([^%_\\]*)%+)");
+
+// Evaluate a SQL-like LIKE pattern by translating it to a regexp or
+// substring search as appropriate. See what Apache Impala does:
+// https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc
+// Note that Impala optimizes more cases (e.g. prefix match) but we
+// don't have kernels for those.
+template
+struct MatchLike {
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    auto original_options = MatchSubstringState::Get(ctx);
+    auto original_state = ctx->state();  // restored before returning
+
+    Status status;
+    std::string pattern;
+    if (re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch,
+                            &pattern)) {  // "%literal%": plain substring search
+      MatchSubstringOptions converted_options{pattern};
+      MatchSubstringState converted_state(converted_options);
+      ctx->SetState(&converted_state);  // stack object, only used by the Exec call below
+      status = MatchSubstring::Exec(ctx, batch, out);
+    } else {  // anything else: translate the LIKE pattern to an anchored regex
+      MatchSubstringOptions converted_options{MakeLikeRegex(original_options)};
+      MatchSubstringState converted_state(converted_options);
+      ctx->SetState(&converted_state);
+      status = MatchSubstring::Exec(ctx, batch, out);
+    }
+    ctx->SetState(original_state);  // undo the temporary state swap
+    return status;
+  }
+};
+
+const FunctionDoc match_like_doc(
+    "Match strings against SQL-style LIKE pattern",
+    ("For each string in `strings`, emit true iff it fully matches a given pattern. "
+     "That is, '%' will match any number of characters, '_' will "
+     "match exactly one character, and any other character matches itself. To "
+     "match a literal '%', '_', or '\\', precede the character with a backslash.\n"
+     "Null inputs emit null.
The pattern must be given in MatchSubstringOptions."), + {"strings"}, "MatchSubstringOptions"); + #endif void AddMatchSubstring(FunctionRegistry* registry) { @@ -518,6 +607,16 @@ void AddMatchSubstring(FunctionRegistry* registry) { func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init)); DCHECK_OK(registry->AddFunction(std::move(func))); } + { + auto func = + std::make_shared("match_like", Arity::Unary(), &match_like_doc); + auto exec_32 = MatchLike::Exec; + auto exec_64 = MatchLike::Exec; + DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init)); + DCHECK_OK( + func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init)); + DCHECK_OK(registry->AddFunction(std::move(func))); + } #endif } diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index a59634b7be8..c20af503ca9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -388,6 +388,61 @@ TYPED_TEST(TestStringKernels, MatchSubstringRegexInvalid) { Invalid, ::testing::HasSubstr("Invalid regular expression: missing ]"), CallFunction("match_substring_regex", {input}, &options)); } + +TYPED_TEST(TestStringKernels, MatchLike) { + auto inputs = R"(["foo", "bar", "foobar", "barfoo", "o", "\nfoo", "foo\n", null])"; + + MatchSubstringOptions prefix_match{"foo%"}; + this->CheckUnary("match_like", "[]", boolean(), "[]", &prefix_match); + this->CheckUnary("match_like", inputs, boolean(), + "[true, false, true, false, false, false, true, null]", &prefix_match); + + MatchSubstringOptions suffix_match{"%foo"}; + this->CheckUnary("match_like", inputs, boolean(), + "[true, false, false, true, false, true, false, null]", &suffix_match); + + MatchSubstringOptions substring_match{"%foo%"}; + this->CheckUnary("match_like", inputs, boolean(), + "[true, false, true, true, false, true, true, null]", + &substring_match); + + 
MatchSubstringOptions trivial_match{"%%"};  // only wildcards: every non-null input
+  this->CheckUnary("match_like", inputs, boolean(),
+                   "[true, true, true, true, true, true, true, null]", &trivial_match);
+
+  MatchSubstringOptions regex_match{"foo%bar"};  // '%' in the middle: regex path
+  this->CheckUnary("match_like", inputs, boolean(),
+                   "[false, false, true, false, false, false, false, null]",
+                   &regex_match);
+}
+
+TYPED_TEST(TestStringKernels, MatchLikeEscaping) {
+  auto inputs = R"(["%%foo", "_bar", "({", "\\baz"])";
+
+  MatchSubstringOptions escape_percent{"\\%%"};  // literal '%', then anything
+  this->CheckUnary("match_like", inputs, boolean(), "[true, false, false, false]",
+                   &escape_percent);
+
+  MatchSubstringOptions escape_underscore{"\\____"};  // literal '_', then three chars
+  this->CheckUnary("match_like", inputs, boolean(), "[false, true, false, false]",
+                   &escape_underscore);
+
+  MatchSubstringOptions escape_regex{"(%"};  // regex metacharacter matched literally
+  this->CheckUnary("match_like", inputs, boolean(), "[false, false, true, false]",
+                   &escape_regex);
+
+  MatchSubstringOptions escape_escape{"\\\\%"};  // literal backslash, then anything
+  this->CheckUnary("match_like", inputs, boolean(), "[false, false, false, true]",
+                   &escape_escape);
+
+  MatchSubstringOptions special_chars{"!@#$^&*()[]{}.?"};  // no wildcards: exact match
+  this->CheckUnary("match_like", R"(["!@#$^&*()[]{}.?"])", boolean(), "[true]",
+                   &special_chars);
+
+  MatchSubstringOptions escape_sequences{"\n\t%"};  // control chars are plain literals
+  this->CheckUnary("match_like", R"(["\n\tfoo\t", "\n\t", "\n"])", boolean(),
+                   "[true, true, false]", &escape_sequences);
+}
 #endif
 
 TYPED_TEST(TestStringKernels, SplitBasics) {
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 592dc4ec1b0..e9b3feb5b56 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -529,28 +529,37 @@ Containment tests
 +---------------------------+------------+------------------------------------+---------------+----------------------------------------+
 | Function name             | Arity      | Input types                        | Output type   | Options class                          |
+===========================+============+====================================+===============+========================================+
-| match_substring           | Unary      | String-like                        | Boolean (1)   | :struct:`MatchSubstringOptions`        |
+| match_like                | Unary      | String-like                        | Boolean (1)   | :struct:`MatchSubstringOptions`        |
 +---------------------------+------------+------------------------------------+---------------+----------------------------------------+
-| match_substring_regex     | Unary      | String-like                        | Boolean (2)   | :struct:`MatchSubstringOptions`        |
+| match_substring           | Unary      | String-like                        | Boolean (2)   | :struct:`MatchSubstringOptions`        |
 +---------------------------+------------+------------------------------------+---------------+----------------------------------------+
-| index_in                  | Unary      | Boolean, Null, Numeric, Temporal,  | Int32 (3)     | :struct:`SetLookupOptions`             |
+| match_substring_regex     | Unary      | String-like                        | Boolean (3)   | :struct:`MatchSubstringOptions`        |
++---------------------------+------------+------------------------------------+---------------+----------------------------------------+
+| index_in                  | Unary      | Boolean, Null, Numeric, Temporal,  | Int32 (4)     | :struct:`SetLookupOptions`             |
 |                           |            | Binary- and String-like            |               |                                        |
 +---------------------------+------------+------------------------------------+---------------+----------------------------------------+
-| is_in                     | Unary      | Boolean, Null, Numeric, Temporal,  | Boolean (4)   | :struct:`SetLookupOptions`             |
+| is_in                     | Unary      | Boolean, Null, Numeric, Temporal,  | Boolean (5)   | :struct:`SetLookupOptions`             |
 |                           |            | Binary- and String-like            |               |                                        |
 +---------------------------+------------+------------------------------------+---------------+----------------------------------------+
 
-* \(1) Output is true iff :member:`MatchSubstringOptions::pattern`
-  is a substring of the corresponding input element.
+* \(1) Output is true iff the SQL-style LIKE pattern
+  :member:`MatchSubstringOptions::pattern` fully matches the
+  corresponding input element. That is, ``%`` will match any number of
+  characters, ``_`` will match exactly one character, and any other
+  character matches itself. To match a literal percent sign,
+  underscore or backslash, precede the character with a backslash.
 
 * \(2) Output is true iff :member:`MatchSubstringOptions::pattern`
+  is a substring of the corresponding input element.
+
+* \(3) Output is true iff :member:`MatchSubstringOptions::pattern`
   matches the corresponding input element at any position.
 
-* \(3) Output is the index of the corresponding input element in
+* \(4) Output is the index of the corresponding input element in
   :member:`SetLookupOptions::value_set`, if found there. Otherwise,
   output is null.
 
-* \(4) Output is true iff the corresponding input element is equal to one
+* \(5) Output is true iff the corresponding input element is equal to one
   of the elements in :member:`SetLookupOptions::value_set`.
 
 
diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst
index da16ccdfa29..7ab8338b522 100644
--- a/docs/source/python/api/compute.rst
+++ b/docs/source/python/api/compute.rst
@@ -156,6 +156,7 @@ Containment tests
 
    index_in
    is_in
+   match_like
    match_substring
    match_substring_regex
 
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index ec38710b023..18d7fee8df0 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -289,6 +289,29 @@ def cast(arr, target_type, safe=True):
     return call_function("cast", [arr], options)
 
 
+def match_like(array, pattern):
+    """
+    Test if the SQL-style LIKE pattern *pattern* matches a value of a
+    string array.
+
+    Parameters
+    ----------
+    array : pyarrow.Array or pyarrow.ChunkedArray
+    pattern : str
+        SQL-style LIKE pattern. '%' will match any number of
+        characters, '_' will match exactly one character, and all
+        other characters match themselves. To match a literal percent
+        sign, underscore or backslash, precede the character with a backslash.
+
+    Returns
+    -------
+    result : pyarrow.Array or pyarrow.ChunkedArray
+
+    """
+    return call_function("match_like", [array],
+                         MatchSubstringOptions(pattern))
+
+
 def match_substring(array, pattern):
     """
     Test if substring *pattern* is contained within a value of a string array.
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 8e045fb4f2d..fc87b2b4a19 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -272,6 +272,13 @@ def test_variance():
     assert pc.variance(data, ddof=1).as_py() == 6.0
 
 
+def test_match_like():
+    arr = pa.array(["ab", "ba%", "ba", "ca%d", None])
+    result = pc.match_like(arr, r"_a\%%")
+    expected = pa.array([False, True, False, True, None])
+    assert expected.equals(result)
+
+
 def test_match_substring():
     arr = pa.array(["ab", "abc", "ba", None])
     result = pc.match_substring(arr, "ab")