From 4e7c02c09fefd5655fbd216cc48bd12365c80fba Mon Sep 17 00:00:00 2001
From: David Li
Date: Thu, 20 May 2021 14:42:07 -0400
Subject: [PATCH] ARROW-12835: [C++][Python][R] Implement case-insensitive
substring match
---
cpp/src/arrow/compute/api_scalar.h | 5 +-
.../arrow/compute/kernels/scalar_string.cc | 130 ++++++++++++------
.../compute/kernels/scalar_string_test.cc | 38 ++++-
python/pyarrow/_compute.pyx | 8 +-
python/pyarrow/compute.py | 18 ++-
python/pyarrow/includes/libarrow.pxd | 3 +-
python/pyarrow/tests/test_compute.py | 24 ++++
r/R/dplyr-functions.R | 4 +-
r/src/compute.cpp | 7 +-
9 files changed, 177 insertions(+), 60 deletions(-)
diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h
index 3a007e06567..dce420b32b2 100644
--- a/cpp/src/arrow/compute/api_scalar.h
+++ b/cpp/src/arrow/compute/api_scalar.h
@@ -43,10 +43,13 @@ struct ArithmeticOptions : public FunctionOptions {
};
struct ARROW_EXPORT MatchSubstringOptions : public FunctionOptions {
- explicit MatchSubstringOptions(std::string pattern) : pattern(std::move(pattern)) {}
+ explicit MatchSubstringOptions(std::string pattern, bool ignore_case = false)
+ : pattern(std::move(pattern)), ignore_case(ignore_case) {}
/// The exact substring (or regex, depending on kernel) to look for inside input values.
std::string pattern;
+ /// Whether to perform a case-insensitive match.
+ bool ignore_case = false;
};
struct ARROW_EXPORT SplitOptions : public FunctionOptions {
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 4d83e1ec24e..d939d1c7722 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -433,33 +433,6 @@ void StringBoolTransform(KernelContext* ctx, const ExecBatch& batch,
using MatchSubstringState = OptionsWrapper<MatchSubstringOptions>;
-template <typename Type, typename Matcher>
-struct MatchSubstring {
- using offset_type = typename Type::offset_type;
- static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
- // TODO Cache matcher across invocations (for regex compilation)
- ARROW_ASSIGN_OR_RAISE(auto matcher, Matcher::Make(MatchSubstringState::Get(ctx)));
- StringBoolTransform(
- ctx, batch,
- [&matcher](const void* raw_offsets, const uint8_t* data, int64_t length,
- int64_t output_offset, uint8_t* output) {
- const offset_type* offsets = reinterpret_cast<const offset_type*>(raw_offsets);
- FirstTimeBitmapWriter bitmap_writer(output, output_offset, length);
- for (int64_t i = 0; i < length; ++i) {
- const char* current_data = reinterpret_cast<const char*>(data + offsets[i]);
- int64_t current_length = offsets[i + 1] - offsets[i];
- if (matcher->Match(util::string_view(current_data, current_length))) {
- bitmap_writer.Set();
- }
- bitmap_writer.Next();
- }
- bitmap_writer.Finish();
- },
- out);
- return Status::OK();
- }
-};
-
// This is an implementation of the Knuth-Morris-Pratt algorithm
struct PlainSubstringMatcher {
const MatchSubstringOptions& options_;
@@ -467,6 +440,8 @@ struct PlainSubstringMatcher {
static Result<std::unique_ptr<PlainSubstringMatcher>> Make(
const MatchSubstringOptions& options) {
+ // Should be handled by partial template specialization below
+ DCHECK(!options.ignore_case);
return ::arrow::internal::make_unique<PlainSubstringMatcher>(options);
}
@@ -509,38 +484,109 @@ struct PlainSubstringMatcher {
bool Match(util::string_view current) const { return Find(current) >= 0; }
};
-const FunctionDoc match_substring_doc(
- "Match strings against literal pattern",
- ("For each string in `strings`, emit true iff it contains a given pattern.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
- {"strings"}, "MatchSubstringOptions");
-
#ifdef ARROW_WITH_RE2
struct RegexSubstringMatcher {
const MatchSubstringOptions& options_;
const RE2 regex_match_;
static Result<std::unique_ptr<RegexSubstringMatcher>> Make(
- const MatchSubstringOptions& options) {
- auto matcher = ::arrow::internal::make_unique<RegexSubstringMatcher>(options);
+ const MatchSubstringOptions& options, bool literal = false) {
+ auto matcher =
+ ::arrow::internal::make_unique<RegexSubstringMatcher>(options, literal);
RETURN_NOT_OK(RegexStatus(matcher->regex_match_));
return std::move(matcher);
}
- explicit RegexSubstringMatcher(const MatchSubstringOptions& options)
- : options_(options), regex_match_(options_.pattern, RE2::Quiet) {}
+ explicit RegexSubstringMatcher(const MatchSubstringOptions& options,
+ bool literal = false)
+ : options_(options),
+ regex_match_(options_.pattern, MakeRE2Options(options, literal)) {}
bool Match(util::string_view current) const {
auto piece = re2::StringPiece(current.data(), current.length());
return re2::RE2::PartialMatch(piece, regex_match_);
}
+
+ static RE2::RE2::Options MakeRE2Options(const MatchSubstringOptions& options,
+ bool literal) {
+ RE2::RE2::Options re2_options(RE2::Quiet);
+ re2_options.set_case_sensitive(!options.ignore_case);
+ re2_options.set_literal(literal);
+ return re2_options;
+ }
+};
+#endif
+
+template <typename Type, typename Matcher>
+struct MatchSubstringImpl {
+ using offset_type = typename Type::offset_type;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out,
+ const Matcher* matcher) {
+ StringBoolTransform(
+ ctx, batch,
+ [&matcher](const void* raw_offsets, const uint8_t* data, int64_t length,
+ int64_t output_offset, uint8_t* output) {
+ const offset_type* offsets = reinterpret_cast<const offset_type*>(raw_offsets);
+ FirstTimeBitmapWriter bitmap_writer(output, output_offset, length);
+ for (int64_t i = 0; i < length; ++i) {
+ const char* current_data = reinterpret_cast<const char*>(data + offsets[i]);
+ int64_t current_length = offsets[i + 1] - offsets[i];
+ if (matcher->Match(util::string_view(current_data, current_length))) {
+ bitmap_writer.Set();
+ }
+ bitmap_writer.Next();
+ }
+ bitmap_writer.Finish();
+ },
+ out);
+ return Status::OK();
+ }
};
+template <typename Type, typename Matcher>
+struct MatchSubstring {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ // TODO Cache matcher across invocations (for regex compilation)
+ ARROW_ASSIGN_OR_RAISE(auto matcher, Matcher::Make(MatchSubstringState::Get(ctx)));
+ return MatchSubstringImpl<Type, Matcher>::Exec(ctx, batch, out, matcher.get());
+ }
+};
+
+template <typename Type>
+struct MatchSubstring<Type, PlainSubstringMatcher> {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ auto options = MatchSubstringState::Get(ctx);
+ if (options.ignore_case) {
+#ifdef ARROW_WITH_RE2
+ ARROW_ASSIGN_OR_RAISE(auto matcher,
+ RegexSubstringMatcher::Make(options, /*literal=*/true));
+ return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+#else
+ return Status::NotImplemented("ignore_case requires RE2");
+#endif
+ }
+ ARROW_ASSIGN_OR_RAISE(auto matcher, PlainSubstringMatcher::Make(options));
+ return MatchSubstringImpl<Type, PlainSubstringMatcher>::Exec(ctx, batch, out,
+ matcher.get());
+ }
+};
+
+const FunctionDoc match_substring_doc(
+ "Match strings against literal pattern",
+ ("For each string in `strings`, emit true iff it contains a given pattern.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
+ "If ignore_case is set, only simple case folding is performed."),
+ {"strings"}, "MatchSubstringOptions");
+
+#ifdef ARROW_WITH_RE2
const FunctionDoc match_substring_regex_doc(
"Match strings against regex pattern",
("For each string in `strings`, emit true iff it matches a given pattern at any "
"position.\n"
- "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
+ "If ignore_case is set, only simple case folding is performed."),
{"strings"}, "MatchSubstringOptions");
// SQL LIKE match
@@ -605,14 +651,16 @@ struct MatchLike {
Status status;
std::string pattern;
- if (re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch,
+ if (!original_options.ignore_case &&
+ re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch,
&pattern)) {
- MatchSubstringOptions converted_options{pattern};
+ MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
MatchSubstringState converted_state(converted_options);
ctx->SetState(&converted_state);
status = MatchSubstring<StringType, PlainSubstringMatcher>::Exec(ctx, batch, out);
} else {
- MatchSubstringOptions converted_options{MakeLikeRegex(original_options)};
+ MatchSubstringOptions converted_options{MakeLikeRegex(original_options),
+ original_options.ignore_case};
MatchSubstringState converted_state(converted_options);
ctx->SetState(&converted_state);
status = MatchSubstring<StringType, RegexSubstringMatcher>::Exec(ctx, batch, out);
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 7f2126828ce..5c230c41cd9 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -377,8 +377,8 @@ TYPED_TEST(TestStringKernels, IsUpperAscii) {
TYPED_TEST(TestStringKernels, MatchSubstring) {
MatchSubstringOptions options{"ab"};
this->CheckUnary("match_substring", "[]", boolean(), "[]", &options);
- this->CheckUnary("match_substring", R"(["abc", "acb", "cab", null, "bac"])", boolean(),
- "[true, false, true, null, false]", &options);
+ this->CheckUnary("match_substring", R"(["abc", "acb", "cab", null, "bac", "AB"])",
+ boolean(), "[true, false, true, null, false, false]", &options);
MatchSubstringOptions options_repeated{"abab"};
this->CheckUnary("match_substring", R"(["abab", "ab", "cababc", null, "bac"])",
@@ -393,12 +393,29 @@ TYPED_TEST(TestStringKernels, MatchSubstring) {
&options_double_char_2);
}
+#ifdef ARROW_WITH_RE2
+TYPED_TEST(TestStringKernels, MatchSubstringIgnoreCase) {
+ MatchSubstringOptions options_insensitive{"aé(", /*ignore_case=*/true};
+ this->CheckUnary("match_substring", R"(["abc", "aEb", "baÉ(", "aé(", "ae(", "Aé("])",
+ boolean(), "[false, false, true, true, false, true]",
+ &options_insensitive);
+}
+#else
+TYPED_TEST(TestStringKernels, MatchSubstringIgnoreCase) {
+ Datum input = ArrayFromJSON(this->type(), R"(["a"])");
+ MatchSubstringOptions options{"a", /*ignore_case=*/true};
+ EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented,
+ ::testing::HasSubstr("ignore_case requires RE2"),
+ CallFunction("match_substring", {input}, &options));
+}
+#endif
+
#ifdef ARROW_WITH_RE2
TYPED_TEST(TestStringKernels, MatchSubstringRegex) {
MatchSubstringOptions options{"ab"};
this->CheckUnary("match_substring_regex", "[]", boolean(), "[]", &options);
- this->CheckUnary("match_substring_regex", R"(["abc", "acb", "cab", null, "bac"])",
- boolean(), "[true, false, true, null, false]", &options);
+ this->CheckUnary("match_substring_regex", R"(["abc", "acb", "cab", null, "bac", "AB"])",
+ boolean(), "[true, false, true, null, false, false]", &options);
MatchSubstringOptions options_repeated{"(ab){2}"};
this->CheckUnary("match_substring_regex", R"(["abab", "ab", "cababc", null, "bac"])",
boolean(), "[true, false, true, null, false]", &options_repeated);
@@ -411,6 +428,10 @@ TYPED_TEST(TestStringKernels, MatchSubstringRegex) {
MatchSubstringOptions options_plus{"a+b"};
this->CheckUnary("match_substring_regex", R"(["aacb", "aab", "dab", "caaab", "b", ""])",
boolean(), "[false, true, true, true, false, false]", &options_plus);
+ MatchSubstringOptions options_insensitive{"ab|é", /*ignore_case=*/true};
+ this->CheckUnary("match_substring_regex", R"(["abc", "acb", "É", null, "bac", "AB"])",
+ boolean(), "[true, false, true, null, false, true]",
+ &options_insensitive);
// Unicode character semantics
// "\pL" means: unicode category "letter"
@@ -458,6 +479,15 @@ TYPED_TEST(TestStringKernels, MatchLike) {
this->CheckUnary("match_like", inputs, boolean(),
"[false, false, true, false, false, false, false, null]",
&regex_match);
+
+ // ignore_case means this still gets mapped to a regex search
+ MatchSubstringOptions insensitive_substring{"%é%", /*ignore_case=*/true};
+ this->CheckUnary("match_like", R"(["é", "fooÉbar", "e"])", boolean(),
+ "[true, true, false]", &insensitive_substring);
+
+ MatchSubstringOptions insensitive_regex{"_é%", /*ignore_case=*/true};
+ this->CheckUnary("match_like", R"(["éfoo", "aÉfoo", "e"])", boolean(),
+ "[false, true, false]", &insensitive_regex);
}
TYPED_TEST(TestStringKernels, MatchLikeEscaping) {
diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx
index debea53b17b..1b62226b2b3 100644
--- a/python/pyarrow/_compute.pyx
+++ b/python/pyarrow/_compute.pyx
@@ -657,14 +657,14 @@ cdef class _MatchSubstringOptions(FunctionOptions):
cdef const CFunctionOptions* get_options(self) except NULL:
return self.match_substring_options.get()
- def _set_options(self, pattern):
+ def _set_options(self, pattern, bint ignore_case):
self.match_substring_options.reset(
- new CMatchSubstringOptions(tobytes(pattern)))
+ new CMatchSubstringOptions(tobytes(pattern), ignore_case))
class MatchSubstringOptions(_MatchSubstringOptions):
- def __init__(self, pattern):
- self._set_options(pattern)
+ def __init__(self, pattern, bint ignore_case=False):
+ self._set_options(pattern, ignore_case)
cdef class _TrimOptions(FunctionOptions):
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index 9430dd4faf2..c447aa95c5c 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -308,7 +308,7 @@ def find_substring(array, pattern):
MatchSubstringOptions(pattern))
-def match_like(array, pattern):
+def match_like(array, pattern, *, ignore_case=False):
"""
Test if the SQL-style LIKE pattern *pattern* matches a value of a
string array.
@@ -321,6 +321,8 @@ def match_like(array, pattern):
characters, '_' will match exactly one character, and all
other characters match themselves. To match a literal percent
sign or underscore, precede the character with a backslash.
+ ignore_case : bool, default False
+ Ignore case while searching.
Returns
-------
@@ -328,10 +330,10 @@ def match_like(array, pattern):
"""
return call_function("match_like", [array],
- MatchSubstringOptions(pattern))
+ MatchSubstringOptions(pattern, ignore_case))
-def match_substring(array, pattern):
+def match_substring(array, pattern, *, ignore_case=False):
"""
Test if substring *pattern* is contained within a value of a string array.
@@ -340,16 +342,18 @@ def match_substring(array, pattern):
array : pyarrow.Array or pyarrow.ChunkedArray
pattern : str
pattern to search for exact matches
+ ignore_case : bool, default False
+ Ignore case while searching.
Returns
-------
result : pyarrow.Array or pyarrow.ChunkedArray
"""
return call_function("match_substring", [array],
- MatchSubstringOptions(pattern))
+ MatchSubstringOptions(pattern, ignore_case))
-def match_substring_regex(array, pattern):
+def match_substring_regex(array, pattern, *, ignore_case=False):
"""
Test if regex *pattern* matches at any position a value of a string array.
@@ -358,13 +362,15 @@ def match_substring_regex(array, pattern):
array : pyarrow.Array or pyarrow.ChunkedArray
pattern : str
regex pattern to search
+ ignore_case : bool, default False
+ Ignore case while searching.
Returns
-------
result : pyarrow.Array or pyarrow.ChunkedArray
"""
return call_function("match_substring_regex", [array],
- MatchSubstringOptions(pattern))
+ MatchSubstringOptions(pattern, ignore_case))
def sum(array):
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 5afa806fa84..9184bd5bbfd 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1783,8 +1783,9 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
cdef cppclass CMatchSubstringOptions \
"arrow::compute::MatchSubstringOptions"(CFunctionOptions):
- CMatchSubstringOptions(c_string pattern)
+ CMatchSubstringOptions(c_string pattern, c_bool ignore_case)
c_string pattern
+ c_bool ignore_case
cdef cppclass CTrimOptions \
"arrow::compute::TrimOptions"(CFunctionOptions):
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 26d52eff08b..127b271dda5 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -308,6 +308,14 @@ def test_match_like():
expected = pa.array([False, True, False, True, None])
assert expected.equals(result)
+ arr = pa.array(["aB", "bA%", "ba", "ca%d", None])
+ result = pc.match_like(arr, r"_a\%%", ignore_case=True)
+ expected = pa.array([False, True, False, True, None])
+ assert expected.equals(result)
+ result = pc.match_like(arr, r"_a\%%", ignore_case=False)
+ expected = pa.array([False, False, False, True, None])
+ assert expected.equals(result)
+
def test_match_substring():
arr = pa.array(["ab", "abc", "ba", None])
@@ -315,6 +323,14 @@ def test_match_substring():
expected = pa.array([True, True, False, None])
assert expected.equals(result)
+ arr = pa.array(["áB", "Ábc", "ba", None])
+ result = pc.match_substring(arr, "áb", ignore_case=True)
+ expected = pa.array([True, True, False, None])
+ assert expected.equals(result)
+ result = pc.match_substring(arr, "áb", ignore_case=False)
+ expected = pa.array([False, False, False, None])
+ assert expected.equals(result)
+
def test_match_substring_regex():
arr = pa.array(["ab", "abc", "ba", "c", None])
@@ -322,6 +338,14 @@ def test_match_substring_regex():
expected = pa.array([True, True, True, False, None])
assert expected.equals(result)
+ arr = pa.array(["aB", "Abc", "BA", "c", None])
+ result = pc.match_substring_regex(arr, "^a?b", ignore_case=True)
+ expected = pa.array([True, True, True, False, None])
+ assert expected.equals(result)
+ result = pc.match_substring_regex(arr, "^a?b", ignore_case=False)
+ expected = pa.array([False, False, False, False, None])
+ assert expected.equals(result)
+
def test_trim():
# \u3000 is unicode whitespace
diff --git a/r/R/dplyr-functions.R b/r/R/dplyr-functions.R
index fadd216a30c..e62f3e93007 100644
--- a/r/R/dplyr-functions.R
+++ b/r/R/dplyr-functions.R
@@ -226,11 +226,11 @@ nse_funcs$str_trim <- function(string, side = c("both", "left", "right")) {
}
nse_funcs$grepl <- function(pattern, x, ignore.case = FALSE, fixed = FALSE) {
- arrow_fun <- ifelse(fixed && !ignore.case, "match_substring", "match_substring_regex")
+ arrow_fun <- ifelse(fixed, "match_substring", "match_substring_regex")
Expression$create(
arrow_fun,
x,
- options = list(pattern = format_string_pattern(pattern, ignore.case, fixed))
+ options = list(pattern = pattern, ignore_case = ignore.case)
)
}
diff --git a/r/src/compute.cpp b/r/src/compute.cpp
index 90c7b4129c7..26f0752d847 100644
--- a/r/src/compute.cpp
+++ b/r/src/compute.cpp
@@ -220,7 +220,12 @@ std::shared_ptr<arrow::compute::FunctionOptions> make_compute_options(
if (func_name == "match_substring" || func_name == "match_substring_regex") {
using Options = arrow::compute::MatchSubstringOptions;
- return std::make_shared<Options>(cpp11::as_cpp<std::string>(options["pattern"]));
+ bool ignore_case = false;
+ if (!Rf_isNull(options["ignore_case"])) {
+ ignore_case = cpp11::as_cpp<bool>(options["ignore_case"]);
+ }
+ return std::make_shared<Options>(cpp11::as_cpp<std::string>(options["pattern"]),
+ ignore_case);
}
if (func_name == "replace_substring" || func_name == "replace_substring_regex") {