From 56adeb3b4fbee701fc5c161f30df4bc0b982acf7 Mon Sep 17 00:00:00 2001
From: David Li
Date: Thu, 24 Jun 2021 14:07:55 -0400
Subject: [PATCH] ARROW-13157: [C++][Python] Implement ignore_case for
find_substring
---
.../arrow/compute/kernels/scalar_string.cc | 88 ++++++++++++++++---
.../compute/kernels/scalar_string_test.cc | 30 +++++++
docs/source/cpp/compute.rst | 4 +-
docs/source/python/api/compute.rst | 1 +
python/pyarrow/compute.py | 27 +++++-
python/pyarrow/tests/test_compute.py | 27 +++---
6 files changed, 145 insertions(+), 32 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc
index dbacb6bb96f..e6820fe4747 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -917,13 +917,44 @@ struct FindSubstring {
}
};
+#ifdef ARROW_WITH_RE2
+struct FindSubstringRegex {  // Functor: position of the first regex match in a value, or -1
+ std::unique_ptr regex_match_;  // compiled pattern, built once in the constructor
+
+ explicit FindSubstringRegex(const MatchSubstringOptions& options,
+ bool literal = false) {
+ std::string regex = "(";  // wrap the pattern in a group so the match can be captured
+ regex.reserve(options.pattern.length() + 2);
+ regex += literal ? RE2::QuoteMeta(options.pattern) : options.pattern;  // literal => escape metacharacters
+ regex += ")";
+ regex_match_.reset(new RE2(std::move(regex), RegexSubstringMatcher::MakeRE2Options(
+ options, /*literal=*/false)));  // false regardless of `literal`: quoting was done above
+ }
+
+ template
+ OutValue Call(KernelContext*, util::string_view val, Status*) const {
+ re2::StringPiece piece(val.data(), val.length());
+ re2::StringPiece match;
+ if (re2::RE2::PartialMatch(piece, *regex_match_, &match)) {
+ return static_cast(match.data() - piece.data());  // distance from start of val to the match
+ }
+ return -1;  // no match found
+ }
+};
+#endif
+
template
struct FindSubstringExec {
using OffsetType = typename TypeTraits::OffsetType;
static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
if (options.ignore_case) {
- return Status::NotImplemented("find_substring with ignore_case");
+#ifdef ARROW_WITH_RE2
+ applicator::ScalarUnaryNotNullStateful
+ kernel{FindSubstringRegex(options, /*literal=*/true)};
+ return kernel.Exec(ctx, batch, out);
+#endif
+ return Status::NotImplemented("ignore_case requires RE2");
}
applicator::ScalarUnaryNotNullStateful kernel{
FindSubstring(PlainSubstringMatcher(options))};
@@ -938,21 +969,52 @@ const FunctionDoc find_substring_doc(
"Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
{"strings"}, "MatchSubstringOptions");
+#ifdef ARROW_WITH_RE2
+template
+struct FindSubstringRegexExec {  // Kernel exec that runs FindSubstringRegex over a batch
+ using OffsetType = typename TypeTraits::OffsetType;
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);  // options stored in kernel state
+ applicator::ScalarUnaryNotNullStateful
+ kernel{FindSubstringRegex(options, /*literal=*/false)};  // pattern is used as a regex, not quoted
+ return kernel.Exec(ctx, batch, out);
+ }
+};
+
+const FunctionDoc find_substring_regex_doc(  // docs attached to "find_substring_regex"
+ "Find location of first match of regex pattern",
+ ("For each string in `strings`, emit the index of the first match of the given "
+ "pattern, or -1 if not found.\n"
+ "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
+ {"strings"}, "MatchSubstringOptions");
+#endif
+
void AddFindSubstring(FunctionRegistry* registry) {
- auto func = std::make_shared("find_substring", Arity::Unary(),
- &find_substring_doc);
- for (const auto& ty : BaseBinaryTypes()) {
- std::shared_ptr offset_type;
- if (ty->id() == Type::type::LARGE_BINARY || ty->id() == Type::type::LARGE_STRING) {
- offset_type = int64();
- } else {
- offset_type = int32();
+ {  // scoped so the second registration below can reuse the name `func`
+ auto func = std::make_shared("find_substring", Arity::Unary(),
+ &find_substring_doc);
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();  // output width follows input offsets
+ DCHECK_OK(func->AddKernel({ty}, offset_type,
+ GenerateTypeAgnosticVarBinaryBase(ty),
+ MatchSubstringState::Init));
}
- DCHECK_OK(func->AddKernel({ty}, offset_type,
- GenerateTypeAgnosticVarBinaryBase(ty),
- MatchSubstringState::Init));
+ DCHECK_OK(registry->AddFunction(std::move(func)));
}
- DCHECK_OK(registry->AddFunction(std::move(func)));
+#ifdef ARROW_WITH_RE2  // the regex variant is only registered when RE2 is available
+ {
+ auto func = std::make_shared("find_substring_regex", Arity::Unary(),
+ &find_substring_regex_doc);
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();  // same rule as above
+ DCHECK_OK(
+ func->AddKernel({ty}, offset_type,
+ GenerateTypeAgnosticVarBinaryBase(ty),
+ MatchSubstringState::Init));
+ }
+ DCHECK_OK(registry->AddFunction(std::move(func)));
+ }
+#endif
}
// Substring count
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 6192e0a5dd7..d5c256fd8ef 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -173,6 +173,36 @@ TYPED_TEST(TestBinaryKernels, FindSubstring) {
"[0, 0, null]", &options_empty);
}
+#ifdef ARROW_WITH_RE2
+TYPED_TEST(TestBinaryKernels, FindSubstringIgnoreCase) {  // RE2 build: literal, case-insensitive search
+ MatchSubstringOptions options{"?AB)", /*ignore_case=*/true};  // pattern contains regex metacharacters
+ this->CheckUnary("find_substring", "[]", this->offset_type(), "[]", &options);
+ this->CheckUnary("find_substring",
+ R"-(["?aB)c", "acb", "c?Ab)", null, "?aBc", "AB)"])-",
+ this->offset_type(), "[0, -1, 1, null, -1, -1]", &options);
+}
+
+TYPED_TEST(TestBinaryKernels, FindSubstringRegex) {  // checks "find_substring_regex" in both case modes
+ MatchSubstringOptions options{"a+", /*ignore_case=*/false};  // case-sensitive pass first
+ this->CheckUnary("find_substring_regex", "[]", this->offset_type(), "[]", &options);
+ this->CheckUnary("find_substring_regex", R"(["a", "A", "baaa", null, "", "AaaA"])",
+ this->offset_type(), "[0, -1, 1, null, -1, 1]", &options);
+
+ options.ignore_case = true;  // same inputs again, now case-insensitive
+ this->CheckUnary("find_substring_regex", "[]", this->offset_type(), "[]", &options);
+ this->CheckUnary("find_substring_regex", R"(["a", "A", "baaa", null, "", "AaaA"])",
+ this->offset_type(), "[0, 0, 1, null, -1, 0]", &options);
+}
+#else
+TYPED_TEST(TestBinaryKernels, FindSubstringIgnoreCase) {  // build without RE2: ignore_case must fail
+ MatchSubstringOptions options{"a+", /*ignore_case=*/true};
+ Datum input = ArrayFromJSON(this->type(), R"(["a"])");
+ EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented,
+ ::testing::HasSubstr("ignore_case requires RE2"),
+ CallFunction("find_substring", {input}, &options));  // expects a NotImplemented status
+}
+#endif
+
TYPED_TEST(TestBinaryKernels, CountSubstring) {
MatchSubstringOptions options{"aba"};
this->CheckUnary("count_substring", "[]", this->offset_type(), "[]", &options);
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index 147885560f5..e785756dcda 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -597,7 +597,9 @@ Containment tests
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
| ends_with | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
-| find_substring | Unary | String-like | Int32 or Int64 (3) | :struct:`MatchSubstringOptions` |
+| find_substring | Unary | Binary- and String-like | Int32 or Int64 (3) | :struct:`MatchSubstringOptions` |
++---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
+| find_substring_regex | Unary | Binary- and String-like | Int32 or Int64 (3) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (4) | :struct:`SetLookupOptions` |
| | | Binary- and String-like | | |
diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst
index 80fcb2078f1..8b264ed9b83 100644
--- a/docs/source/python/api/compute.rst
+++ b/docs/source/python/api/compute.rst
@@ -205,6 +205,7 @@ Containment tests
count_substring_regex
ends_with
find_substring
+ find_substring_regex
index_in
is_in
match_like
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index b258b551f02..aacf8456c1b 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -331,7 +331,7 @@ def count_substring_regex(array, pattern, *, ignore_case=False):
MatchSubstringOptions(pattern, ignore_case))
-def find_substring(array, pattern):
+def find_substring(array, pattern, *, ignore_case=False):
"""
Find the index of the first occurrence of substring *pattern* in each
value of a string array.
@@ -341,13 +341,36 @@ def find_substring(array, pattern):
array : pyarrow.Array or pyarrow.ChunkedArray
pattern : str
pattern to search for exact matches
+ ignore_case : bool, default False
+ Ignore case while searching.
Returns
-------
result : pyarrow.Array or pyarrow.ChunkedArray
"""
return call_function("find_substring", [array],
- MatchSubstringOptions(pattern))
+ MatchSubstringOptions(pattern, ignore_case))
+
+
+def find_substring_regex(array, pattern, *, ignore_case=False):
+ """
+ Find the index of the first match of regex *pattern* in each
+ value of a string array, or -1 where there is no match.
+
+ Parameters
+ ----------
+ array : pyarrow.Array or pyarrow.ChunkedArray
+ pattern : str
+ regex pattern to search for
+ ignore_case : bool, default False
+ Ignore case while searching.
+
+ Returns
+ -------
+ result : pyarrow.Array or pyarrow.ChunkedArray
+ """
+ return call_function("find_substring_regex", [array],
+ MatchSubstringOptions(pattern, ignore_case))
def match_like(array, pattern, *, ignore_case=False):
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index efe2e6be2f8..8b294b85759 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -319,25 +319,20 @@ def test_count_substring_regex():
def test_find_substring():
- arr = pa.array(["ab", "cab", "ba", None])
- result = pc.find_substring(arr, "ab")
- expected = pa.array([0, 1, -1, None], type=pa.int32())
- assert expected.equals(result)
+ for ty in [pa.string(), pa.binary(), pa.large_string(), pa.large_binary()]:  # same checks for every type
+ arr = pa.array(["ab", "cab", "ba", None], type=ty)
+ result = pc.find_substring(arr, "ab")
+ assert result.to_pylist() == [0, 1, -1, None]
- arr = pa.array(["ab", "cab", "ba", None], type=pa.large_string())
- result = pc.find_substring(arr, "ab")
- expected = pa.array([0, 1, -1, None], type=pa.int64())
- assert expected.equals(result)
+ result = pc.find_substring_regex(arr, "a?b")  # regex: a bare "b" also matches, so "ba" gives 0
+ assert result.to_pylist() == [0, 1, 0, None]
- arr = pa.array([b"ab", b"cab", b"ba", None])
- result = pc.find_substring(arr, b"ab")
- expected = pa.array([0, 1, -1, None], type=pa.int32())
- assert expected.equals(result)
+ arr = pa.array(["ab*", "cAB*", "ba", "aB?"], type=ty)
+ result = pc.find_substring(arr, "aB*", ignore_case=True)  # "*" is matched literally here
+ assert result.to_pylist() == [0, 1, -1, -1]
- arr = pa.array([b"ab", b"cab", b"ba", None], type=pa.large_binary())
- result = pc.find_substring(arr, b"ab")
- expected = pa.array([0, 1, -1, None], type=pa.int64())
- assert expected.equals(result)
+ result = pc.find_substring_regex(arr, "a?b", ignore_case=True)  # regex, case-insensitive
+ assert result.to_pylist() == [0, 1, 0, 0]
def test_match_like():