diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 3a007e06567..dce420b32b2 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -43,10 +43,13 @@ struct ArithmeticOptions : public FunctionOptions { }; struct ARROW_EXPORT MatchSubstringOptions : public FunctionOptions { - explicit MatchSubstringOptions(std::string pattern) : pattern(std::move(pattern)) {} + explicit MatchSubstringOptions(std::string pattern, bool ignore_case = false) + : pattern(std::move(pattern)), ignore_case(ignore_case) {} /// The exact substring (or regex, depending on kernel) to look for inside input values. std::string pattern; + /// Whether to perform a case-insensitive match. + bool ignore_case = false; }; struct ARROW_EXPORT SplitOptions : public FunctionOptions { diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index 4d83e1ec24e..d939d1c7722 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -433,33 +433,6 @@ void StringBoolTransform(KernelContext* ctx, const ExecBatch& batch, using MatchSubstringState = OptionsWrapper<MatchSubstringOptions>; -template <typename Type, typename Matcher> -struct MatchSubstring { - using offset_type = typename Type::offset_type; - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - // TODO Cache matcher across invocations (for regex compilation) - ARROW_ASSIGN_OR_RAISE(auto matcher, Matcher::Make(MatchSubstringState::Get(ctx))); - StringBoolTransform<Type>( - ctx, batch, - [&matcher](const void* raw_offsets, const uint8_t* data, int64_t length, - int64_t output_offset, uint8_t* output) { - const offset_type* offsets = reinterpret_cast<const offset_type*>(raw_offsets); - FirstTimeBitmapWriter bitmap_writer(output, output_offset, length); - for (int64_t i = 0; i < length; ++i) { - const char* current_data = reinterpret_cast<const char*>(data + offsets[i]); - int64_t current_length = offsets[i + 1] - offsets[i]; - if 
(matcher->Match(util::string_view(current_data, current_length))) { - bitmap_writer.Set(); - } - bitmap_writer.Next(); - } - bitmap_writer.Finish(); - }, - out); - return Status::OK(); - } -}; - // This is an implementation of the Knuth-Morris-Pratt algorithm struct PlainSubstringMatcher { const MatchSubstringOptions& options_; @@ -467,6 +440,8 @@ struct PlainSubstringMatcher { static Result<std::unique_ptr<PlainSubstringMatcher>> Make( const MatchSubstringOptions& options) { + // Should be handled by partial template specialization below + DCHECK(!options.ignore_case); return ::arrow::internal::make_unique<PlainSubstringMatcher>(options); } @@ -509,38 +484,109 @@ struct PlainSubstringMatcher { bool Match(util::string_view current) const { return Find(current) >= 0; } }; -const FunctionDoc match_substring_doc( - "Match strings against literal pattern", - ("For each string in `strings`, emit true iff it contains a given pattern.\n" - "Null inputs emit null. The pattern must be given in MatchSubstringOptions."), - {"strings"}, "MatchSubstringOptions"); - #ifdef ARROW_WITH_RE2 struct RegexSubstringMatcher { const MatchSubstringOptions& options_; const RE2 regex_match_; static Result<std::unique_ptr<RegexSubstringMatcher>> Make( - const MatchSubstringOptions& options) { - auto matcher = ::arrow::internal::make_unique<RegexSubstringMatcher>(options); + const MatchSubstringOptions& options, bool literal = false) { + auto matcher = + ::arrow::internal::make_unique<RegexSubstringMatcher>(options, literal); RETURN_NOT_OK(RegexStatus(matcher->regex_match_)); return std::move(matcher); } - explicit RegexSubstringMatcher(const MatchSubstringOptions& options) - : options_(options), regex_match_(options_.pattern, RE2::Quiet) {} + explicit RegexSubstringMatcher(const MatchSubstringOptions& options, + bool literal = false) + : options_(options), + regex_match_(options_.pattern, MakeRE2Options(options, literal)) {} bool Match(util::string_view current) const { auto piece = re2::StringPiece(current.data(), current.length()); return re2::RE2::PartialMatch(piece, regex_match_); } + + static RE2::RE2::Options MakeRE2Options(const 
MatchSubstringOptions& options, + bool literal) { + RE2::RE2::Options re2_options(RE2::Quiet); + re2_options.set_case_sensitive(!options.ignore_case); + re2_options.set_literal(literal); + return re2_options; + } +}; +#endif + +template <typename Type, typename Matcher> +struct MatchSubstringImpl { + using offset_type = typename Type::offset_type; + + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out, + const Matcher* matcher) { + StringBoolTransform<Type>( + ctx, batch, + [&matcher](const void* raw_offsets, const uint8_t* data, int64_t length, + int64_t output_offset, uint8_t* output) { + const offset_type* offsets = reinterpret_cast<const offset_type*>(raw_offsets); + FirstTimeBitmapWriter bitmap_writer(output, output_offset, length); + for (int64_t i = 0; i < length; ++i) { + const char* current_data = reinterpret_cast<const char*>(data + offsets[i]); + int64_t current_length = offsets[i + 1] - offsets[i]; + if (matcher->Match(util::string_view(current_data, current_length))) { + bitmap_writer.Set(); + } + bitmap_writer.Next(); + } + bitmap_writer.Finish(); + }, + out); + return Status::OK(); + } }; +template <typename Type, typename Matcher> +struct MatchSubstring { + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + // TODO Cache matcher across invocations (for regex compilation) + ARROW_ASSIGN_OR_RAISE(auto matcher, Matcher::Make(MatchSubstringState::Get(ctx))); + return MatchSubstringImpl<Type, Matcher>::Exec(ctx, batch, out, matcher.get()); + } +}; + +template <typename Type> +struct MatchSubstring<Type, PlainSubstringMatcher> { + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + auto options = MatchSubstringState::Get(ctx); + if (options.ignore_case) { +#ifdef ARROW_WITH_RE2 + ARROW_ASSIGN_OR_RAISE(auto matcher, + RegexSubstringMatcher::Make(options, /*literal=*/true)); + return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out, + matcher.get()); +#else + return Status::NotImplemented("ignore_case requires RE2"); +#endif + } + ARROW_ASSIGN_OR_RAISE(auto matcher, PlainSubstringMatcher::Make(options)); + return MatchSubstringImpl<Type, PlainSubstringMatcher>::Exec(ctx, batch, 
out, + matcher.get()); + } +}; + +const FunctionDoc match_substring_doc( + "Match strings against literal pattern", + ("For each string in `strings`, emit true iff it contains a given pattern.\n" + "Null inputs emit null. The pattern must be given in MatchSubstringOptions. " + "If ignore_case is set, only simple case folding is performed."), + {"strings"}, "MatchSubstringOptions"); + +#ifdef ARROW_WITH_RE2 const FunctionDoc match_substring_regex_doc( "Match strings against regex pattern", ("For each string in `strings`, emit true iff it matches a given pattern at any " "position.\n" - "Null inputs emit null. The pattern must be given in MatchSubstringOptions."), + "Null inputs emit null. The pattern must be given in MatchSubstringOptions. " + "If ignore_case is set, only simple case folding is performed."), {"strings"}, "MatchSubstringOptions"); // SQL LIKE match @@ -605,14 +651,16 @@ struct MatchLike { Status status; std::string pattern; - if (re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch, + if (!original_options.ignore_case && + re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch, &pattern)) { - MatchSubstringOptions converted_options{pattern}; + MatchSubstringOptions converted_options{pattern, original_options.ignore_case}; MatchSubstringState converted_state(converted_options); ctx->SetState(&converted_state); status = MatchSubstring<StringType, PlainSubstringMatcher>::Exec(ctx, batch, out); } else { - MatchSubstringOptions converted_options{MakeLikeRegex(original_options)}; + MatchSubstringOptions converted_options{MakeLikeRegex(original_options), + original_options.ignore_case}; MatchSubstringState converted_state(converted_options); ctx->SetState(&converted_state); status = MatchSubstring<StringType, RegexSubstringMatcher>::Exec(ctx, batch, out); diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 7f2126828ce..5c230c41cd9 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ 
b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -377,8 +377,8 @@ TYPED_TEST(TestStringKernels, IsUpperAscii) { TYPED_TEST(TestStringKernels, MatchSubstring) { MatchSubstringOptions options{"ab"}; this->CheckUnary("match_substring", "[]", boolean(), "[]", &options); - this->CheckUnary("match_substring", R"(["abc", "acb", "cab", null, "bac"])", boolean(), - "[true, false, true, null, false]", &options); + this->CheckUnary("match_substring", R"(["abc", "acb", "cab", null, "bac", "AB"])", + boolean(), "[true, false, true, null, false, false]", &options); MatchSubstringOptions options_repeated{"abab"}; this->CheckUnary("match_substring", R"(["abab", "ab", "cababc", null, "bac"])", @@ -393,12 +393,29 @@ TYPED_TEST(TestStringKernels, MatchSubstring) { &options_double_char_2); } +#ifdef ARROW_WITH_RE2 +TYPED_TEST(TestStringKernels, MatchSubstringIgnoreCase) { + MatchSubstringOptions options_insensitive{"aé(", /*ignore_case=*/true}; + this->CheckUnary("match_substring", R"(["abc", "aEb", "baÉ(", "aé(", "ae(", "Aé("])", + boolean(), "[false, false, true, true, false, true]", + &options_insensitive); +} +#else +TYPED_TEST(TestStringKernels, MatchSubstringIgnoreCase) { + Datum input = ArrayFromJSON(this->type(), R"(["a"])"); + MatchSubstringOptions options{"a", /*ignore_case=*/true}; + EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented, + ::testing::HasSubstr("ignore_case requires RE2"), + CallFunction("match_substring", {input}, &options)); +} +#endif + #ifdef ARROW_WITH_RE2 TYPED_TEST(TestStringKernels, MatchSubstringRegex) { MatchSubstringOptions options{"ab"}; this->CheckUnary("match_substring_regex", "[]", boolean(), "[]", &options); - this->CheckUnary("match_substring_regex", R"(["abc", "acb", "cab", null, "bac"])", - boolean(), "[true, false, true, null, false]", &options); + this->CheckUnary("match_substring_regex", R"(["abc", "acb", "cab", null, "bac", "AB"])", + boolean(), "[true, false, true, null, false, false]", &options); MatchSubstringOptions 
options_repeated{"(ab){2}"}; this->CheckUnary("match_substring_regex", R"(["abab", "ab", "cababc", null, "bac"])", boolean(), "[true, false, true, null, false]", &options_repeated); @@ -411,6 +428,10 @@ TYPED_TEST(TestStringKernels, MatchSubstringRegex) { MatchSubstringOptions options_plus{"a+b"}; this->CheckUnary("match_substring_regex", R"(["aacb", "aab", "dab", "caaab", "b", ""])", boolean(), "[false, true, true, true, false, false]", &options_plus); + MatchSubstringOptions options_insensitive{"ab|é", /*ignore_case=*/true}; + this->CheckUnary("match_substring_regex", R"(["abc", "acb", "É", null, "bac", "AB"])", + boolean(), "[true, false, true, null, false, true]", + &options_insensitive); // Unicode character semantics // "\pL" means: unicode category "letter" @@ -458,6 +479,15 @@ TYPED_TEST(TestStringKernels, MatchLike) { this->CheckUnary("match_like", inputs, boolean(), "[false, false, true, false, false, false, false, null]", &regex_match); + + // ignore_case means this still gets mapped to a regex search + MatchSubstringOptions insensitive_substring{"%é%", /*ignore_case=*/true}; + this->CheckUnary("match_like", R"(["é", "fooÉbar", "e"])", boolean(), + "[true, true, false]", &insensitive_substring); + + MatchSubstringOptions insensitive_regex{"_é%", /*ignore_case=*/true}; + this->CheckUnary("match_like", R"(["éfoo", "aÉfoo", "e"])", boolean(), + "[false, true, false]", &insensitive_regex); } TYPED_TEST(TestStringKernels, MatchLikeEscaping) { diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index debea53b17b..1b62226b2b3 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -657,14 +657,14 @@ cdef class _MatchSubstringOptions(FunctionOptions): cdef const CFunctionOptions* get_options(self) except NULL: return self.match_substring_options.get() - def _set_options(self, pattern): + def _set_options(self, pattern, bint ignore_case): self.match_substring_options.reset( - new CMatchSubstringOptions(tobytes(pattern))) + 
new CMatchSubstringOptions(tobytes(pattern), ignore_case)) class MatchSubstringOptions(_MatchSubstringOptions): - def __init__(self, pattern): - self._set_options(pattern) + def __init__(self, pattern, bint ignore_case=False): + self._set_options(pattern, ignore_case) cdef class _TrimOptions(FunctionOptions): diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 9430dd4faf2..c447aa95c5c 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -308,7 +308,7 @@ def find_substring(array, pattern): MatchSubstringOptions(pattern)) -def match_like(array, pattern): +def match_like(array, pattern, *, ignore_case=False): """ Test if the SQL-style LIKE pattern *pattern* matches a value of a string array. @@ -321,6 +321,8 @@ def match_like(array, pattern): characters, '_' will match exactly one character, and all other characters match themselves. To match a literal percent sign or underscore, precede the character with a backslash. + ignore_case : bool, default False + Ignore case while searching. Returns ------- @@ -328,10 +330,10 @@ def match_like(array, pattern): """ return call_function("match_like", [array], - MatchSubstringOptions(pattern)) + MatchSubstringOptions(pattern, ignore_case)) -def match_substring(array, pattern): +def match_substring(array, pattern, *, ignore_case=False): """ Test if substring *pattern* is contained within a value of a string array. @@ -340,16 +342,18 @@ def match_substring(array, pattern): array : pyarrow.Array or pyarrow.ChunkedArray pattern : str pattern to search for exact matches + ignore_case : bool, default False + Ignore case while searching. 
Returns ------- result : pyarrow.Array or pyarrow.ChunkedArray """ return call_function("match_substring", [array], - MatchSubstringOptions(pattern)) + MatchSubstringOptions(pattern, ignore_case)) -def match_substring_regex(array, pattern): +def match_substring_regex(array, pattern, *, ignore_case=False): """ Test if regex *pattern* matches at any position a value of a string array. @@ -358,13 +362,15 @@ def match_substring_regex(array, pattern): array : pyarrow.Array or pyarrow.ChunkedArray pattern : str regex pattern to search + ignore_case : bool, default False + Ignore case while searching. Returns ------- result : pyarrow.Array or pyarrow.ChunkedArray """ return call_function("match_substring_regex", [array], - MatchSubstringOptions(pattern)) + MatchSubstringOptions(pattern, ignore_case)) def sum(array): diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 5afa806fa84..9184bd5bbfd 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1783,8 +1783,9 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: cdef cppclass CMatchSubstringOptions \ "arrow::compute::MatchSubstringOptions"(CFunctionOptions): - CMatchSubstringOptions(c_string pattern) + CMatchSubstringOptions(c_string pattern, c_bool ignore_case) c_string pattern + c_bool ignore_case cdef cppclass CTrimOptions \ "arrow::compute::TrimOptions"(CFunctionOptions): diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 26d52eff08b..127b271dda5 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -308,6 +308,14 @@ def test_match_like(): expected = pa.array([False, True, False, True, None]) assert expected.equals(result) + arr = pa.array(["aB", "bA%", "ba", "ca%d", None]) + result = pc.match_like(arr, r"_a\%%", ignore_case=True) + expected = pa.array([False, True, False, True, None]) + assert expected.equals(result) + result 
= pc.match_like(arr, r"_a\%%", ignore_case=False) + expected = pa.array([False, False, False, True, None]) + assert expected.equals(result) + def test_match_substring(): arr = pa.array(["ab", "abc", "ba", None]) @@ -315,6 +323,14 @@ def test_match_substring(): expected = pa.array([True, True, False, None]) assert expected.equals(result) + arr = pa.array(["áB", "Ábc", "ba", None]) + result = pc.match_substring(arr, "áb", ignore_case=True) + expected = pa.array([True, True, False, None]) + assert expected.equals(result) + result = pc.match_substring(arr, "áb", ignore_case=False) + expected = pa.array([False, False, False, None]) + assert expected.equals(result) + def test_match_substring_regex(): arr = pa.array(["ab", "abc", "ba", "c", None]) @@ -322,6 +338,14 @@ def test_match_substring_regex(): expected = pa.array([True, True, True, False, None]) assert expected.equals(result) + arr = pa.array(["aB", "Abc", "BA", "c", None]) + result = pc.match_substring_regex(arr, "^a?b", ignore_case=True) + expected = pa.array([True, True, True, False, None]) + assert expected.equals(result) + result = pc.match_substring_regex(arr, "^a?b", ignore_case=False) + expected = pa.array([False, False, False, False, None]) + assert expected.equals(result) + def test_trim(): # \u3000 is unicode whitespace diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R index fadd216a30c..e62f3e93007 100644 --- a/r/R/dplyr-functions.R +++ b/r/R/dplyr-functions.R @@ -226,11 +226,11 @@ nse_funcs$str_trim <- function(string, side = c("both", "left", "right")) { } nse_funcs$grepl <- function(pattern, x, ignore.case = FALSE, fixed = FALSE) { - arrow_fun <- ifelse(fixed && !ignore.case, "match_substring", "match_substring_regex") + arrow_fun <- ifelse(fixed, "match_substring", "match_substring_regex") Expression$create( arrow_fun, x, - options = list(pattern = format_string_pattern(pattern, ignore.case, fixed)) + options = list(pattern = pattern, ignore_case = ignore.case) ) } diff --git 
a/r/src/compute.cpp b/r/src/compute.cpp index 90c7b4129c7..26f0752d847 100644 --- a/r/src/compute.cpp +++ b/r/src/compute.cpp @@ -220,7 +220,12 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options( if (func_name == "match_substring" || func_name == "match_substring_regex") { using Options = arrow::compute::MatchSubstringOptions; - return std::make_shared<Options>(cpp11::as_cpp<std::string>(options["pattern"])); + bool ignore_case = false; + if (!Rf_isNull(options["ignore_case"])) { + ignore_case = cpp11::as_cpp<bool>(options["ignore_case"]); + } + return std::make_shared<Options>(cpp11::as_cpp<std::string>(options["pattern"]), + ignore_case); } if (func_name == "replace_substring" || func_name == "replace_substring_regex") {