Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 75 additions & 13 deletions cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -917,13 +917,44 @@ struct FindSubstring {
}
};

#ifdef ARROW_WITH_RE2
struct FindSubstringRegex {
std::unique_ptr<RE2> regex_match_;

explicit FindSubstringRegex(const MatchSubstringOptions& options,
bool literal = false) {
std::string regex = "(";
regex.reserve(options.pattern.length() + 2);
regex += literal ? RE2::QuoteMeta(options.pattern) : options.pattern;
regex += ")";
regex_match_.reset(new RE2(std::move(regex), RegexSubstringMatcher::MakeRE2Options(
options, /*literal=*/false)));
}

template <typename OutValue, typename... Ignored>
OutValue Call(KernelContext*, util::string_view val, Status*) const {
re2::StringPiece piece(val.data(), val.length());
re2::StringPiece match;
if (re2::RE2::PartialMatch(piece, *regex_match_, &match)) {
return static_cast<OutValue>(match.data() - piece.data());
}
return -1;
}
};
#endif

template <typename InputType>
struct FindSubstringExec {
using OffsetType = typename TypeTraits<InputType>::OffsetType;
static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
const MatchSubstringOptions& options = MatchSubstringState::Get(ctx);
if (options.ignore_case) {
return Status::NotImplemented("find_substring with ignore_case");
#ifdef ARROW_WITH_RE2
applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstringRegex>
kernel{FindSubstringRegex(options, /*literal=*/true)};
return kernel.Exec(ctx, batch, out);
#endif
return Status::NotImplemented("ignore_case requires RE2");
}
applicator::ScalarUnaryNotNullStateful<OffsetType, InputType, FindSubstring> kernel{
FindSubstring(PlainSubstringMatcher(options))};
Expand All @@ -938,21 +969,52 @@ const FunctionDoc find_substring_doc(
"Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
{"strings"}, "MatchSubstringOptions");

#ifdef ARROW_WITH_RE2
// Kernel entry point that runs FindSubstringRegex (non-literal mode) over a batch,
// using the MatchSubstringOptions stored in the kernel state.
template <typename InputType>
struct FindSubstringRegexExec {
  using OffsetType = typename TypeTraits<InputType>::OffsetType;

  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
    using RegexKernel =
        applicator::ScalarUnaryNotNullStateful<OffsetType, InputType,
                                               FindSubstringRegex>;
    const MatchSubstringOptions& match_options = MatchSubstringState::Get(ctx);
    // The pattern is a real regex here, so it must not be quoted.
    RegexKernel regex_kernel{FindSubstringRegex(match_options, /*literal=*/false)};
    return regex_kernel.Exec(ctx, batch, out);
  }
};

// Documentation attached to the "find_substring_regex" function: a summary line,
// a longer description, the argument names, and the name of the options class.
const FunctionDoc find_substring_regex_doc(
"Find location of first match of regex pattern",
("For each string in `strings`, emit the index of the first match of the given "
"pattern, or -1 if not found.\n"
"Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
{"strings"}, "MatchSubstringOptions");
#endif

// Registers the "find_substring" scalar function and, when ARROW_WITH_RE2 is
// defined, the "find_substring_regex" scalar function. Each function gets one
// kernel per type in BaseBinaryTypes(), initialised via MatchSubstringState::Init.
// NOTE(review): the statements below appear to contain both the pre-change and the
// post-change lines of a diff (e.g. two "find_substring" registrations and
// unbalanced braces) — confirm against the actual source file.
void AddFindSubstring(FunctionRegistry* registry) {
auto func = std::make_shared<ScalarFunction>("find_substring", Arity::Unary(),
&find_substring_doc);
for (const auto& ty : BaseBinaryTypes()) {
std::shared_ptr<DataType> offset_type;
// Large binary/string inputs produce int64 results; the others produce int32.
if (ty->id() == Type::type::LARGE_BINARY || ty->id() == Type::type::LARGE_STRING) {
offset_type = int64();
} else {
offset_type = int32();
{
auto func = std::make_shared<ScalarFunction>("find_substring", Arity::Unary(),
&find_substring_doc);
for (const auto& ty : BaseBinaryTypes()) {
// Output type follows the input's offset width: 64-bit -> int64, otherwise int32.
auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
DCHECK_OK(func->AddKernel({ty}, offset_type,
GenerateTypeAgnosticVarBinaryBase<FindSubstringExec>(ty),
MatchSubstringState::Init));
}
DCHECK_OK(func->AddKernel({ty}, offset_type,
GenerateTypeAgnosticVarBinaryBase<FindSubstringExec>(ty),
MatchSubstringState::Init));
DCHECK_OK(registry->AddFunction(std::move(func)));
}
DCHECK_OK(registry->AddFunction(std::move(func)));
#ifdef ARROW_WITH_RE2
{
// Regex variant: same type coverage, but dispatches to FindSubstringRegexExec.
auto func = std::make_shared<ScalarFunction>("find_substring_regex", Arity::Unary(),
&find_substring_regex_doc);
for (const auto& ty : BaseBinaryTypes()) {
auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32();
DCHECK_OK(
func->AddKernel({ty}, offset_type,
GenerateTypeAgnosticVarBinaryBase<FindSubstringRegexExec>(ty),
MatchSubstringState::Init));
}
DCHECK_OK(registry->AddFunction(std::move(func)));
}
#endif
}

// Substring count
Expand Down
30 changes: 30 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,36 @@ TYPED_TEST(TestBinaryKernels, FindSubstring) {
"[0, 0, null]", &options_empty);
}

#ifdef ARROW_WITH_RE2
// find_substring with ignore_case: the pattern contains characters that are
// special in regexes ('?' and ')'), and the expected indices only match where
// those characters appear verbatim.
TYPED_TEST(TestBinaryKernels, FindSubstringIgnoreCase) {
  MatchSubstringOptions literal_ci{"?AB)", /*ignore_case=*/true};
  const auto result_type = this->offset_type();

  // Empty input yields empty output.
  this->CheckUnary("find_substring", "[]", result_type, "[]", &literal_ci);

  const char* inputs = R"-(["?aB)c", "acb", "c?Ab)", null, "?aBc", "AB)"])-";
  const char* expected = "[0, -1, 1, null, -1, -1]";
  this->CheckUnary("find_substring", inputs, result_type, expected, &literal_ci);
}

// find_substring_regex on the pattern "a+", first case-sensitively and then with
// ignore_case enabled, over the same input values.
TYPED_TEST(TestBinaryKernels, FindSubstringRegex) {
  const char* inputs = R"(["a", "A", "baaa", null, "", "AaaA"])";
  const auto result_type = this->offset_type();

  // Case-sensitive: uppercase 'A' does not match.
  MatchSubstringOptions case_sensitive{"a+", /*ignore_case=*/false};
  this->CheckUnary("find_substring_regex", "[]", result_type, "[]", &case_sensitive);
  this->CheckUnary("find_substring_regex", inputs, result_type,
                   "[0, -1, 1, null, -1, 1]", &case_sensitive);

  // Case-insensitive: uppercase 'A' matches as well.
  MatchSubstringOptions case_insensitive{"a+", /*ignore_case=*/true};
  this->CheckUnary("find_substring_regex", "[]", result_type, "[]",
                   &case_insensitive);
  this->CheckUnary("find_substring_regex", inputs, result_type,
                   "[0, 0, 1, null, -1, 0]", &case_insensitive);
}
#else
// Built without RE2: find_substring with ignore_case must fail with a
// NotImplemented status whose message mentions the RE2 requirement.
TYPED_TEST(TestBinaryKernels, FindSubstringIgnoreCase) {
  MatchSubstringOptions ci_options{"a+", /*ignore_case=*/true};
  Datum values = ArrayFromJSON(this->type(), R"(["a"])");
  const auto message_matcher = ::testing::HasSubstr("ignore_case requires RE2");
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      NotImplemented, message_matcher,
      CallFunction("find_substring", {values}, &ci_options));
}
#endif

TYPED_TEST(TestBinaryKernels, CountSubstring) {
MatchSubstringOptions options{"aba"};
this->CheckUnary("count_substring", "[]", this->offset_type(), "[]", &options);
Expand Down
4 changes: 3 additions & 1 deletion docs/source/cpp/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -597,7 +597,9 @@ Containment tests
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
| ends_with | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
| find_substring | Unary | String-like | Int32 or Int64 (3) | :struct:`MatchSubstringOptions` |
| find_substring | Unary | Binary- and String-like | Int32 or Int64 (3) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
| find_substring_regex | Unary | Binary- and String-like | Int32 or Int64 (3) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (4) | :struct:`SetLookupOptions` |
| | | Binary- and String-like | | |
Expand Down
1 change: 1 addition & 0 deletions docs/source/python/api/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ Containment tests
count_substring_regex
ends_with
find_substring
find_substring_regex
index_in
is_in
match_like
Expand Down
27 changes: 25 additions & 2 deletions python/pyarrow/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ def count_substring_regex(array, pattern, *, ignore_case=False):
MatchSubstringOptions(pattern, ignore_case))


def find_substring(array, pattern):
def find_substring(array, pattern, *, ignore_case=False):
"""
Find the index of the first occurrence of substring *pattern* in each
value of a string array.
Expand All @@ -341,13 +341,36 @@ def find_substring(array, pattern):
array : pyarrow.Array or pyarrow.ChunkedArray
pattern : str
pattern to search for exact matches
ignore_case : bool, default False
Ignore case while searching.

Returns
-------
result : pyarrow.Array or pyarrow.ChunkedArray
"""
return call_function("find_substring", [array],
MatchSubstringOptions(pattern))
MatchSubstringOptions(pattern, ignore_case))


def find_substring_regex(array, pattern, *, ignore_case=False):
    """
    Locate the first match of the regular expression *pattern* in each
    value of *array*.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        Regular expression to search for.
    ignore_case : bool, default False
        Whether the search should ignore case.

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    options = MatchSubstringOptions(pattern, ignore_case)
    return call_function("find_substring_regex", [array], options)


def match_like(array, pattern, *, ignore_case=False):
Expand Down
27 changes: 11 additions & 16 deletions python/pyarrow/tests/test_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,25 +319,20 @@ def test_count_substring_regex():


def test_find_substring():
# Exercises pc.find_substring and pc.find_substring_regex (with and without
# ignore_case) and compares the returned indices with the expected values.
# NOTE(review): this span appears to interleave the removed and the added lines
# of a diff, with indentation lost — confirm against the actual test file.
arr = pa.array(["ab", "cab", "ba", None])
result = pc.find_substring(arr, "ab")
expected = pa.array([0, 1, -1, None], type=pa.int32())
assert expected.equals(result)
# Same checks repeated for each string/binary type, including the large variants.
for ty in [pa.string(), pa.binary(), pa.large_string(), pa.large_binary()]:
arr = pa.array(["ab", "cab", "ba", None], type=ty)
result = pc.find_substring(arr, "ab")
assert result.to_pylist() == [0, 1, -1, None]

arr = pa.array(["ab", "cab", "ba", None], type=pa.large_string())
result = pc.find_substring(arr, "ab")
expected = pa.array([0, 1, -1, None], type=pa.int64())
assert expected.equals(result)
# Regex "a?b" also matches a lone "b", so "ba" is found at index 0.
result = pc.find_substring_regex(arr, "a?b")
assert result.to_pylist() == [0, 1, 0, None]

arr = pa.array([b"ab", b"cab", b"ba", None])
result = pc.find_substring(arr, b"ab")
expected = pa.array([0, 1, -1, None], type=pa.int32())
assert expected.equals(result)
# With find_substring the "*" in the pattern is matched as a literal character.
arr = pa.array(["ab*", "cAB*", "ba", "aB?"], type=ty)
result = pc.find_substring(arr, "aB*", ignore_case=True)
assert result.to_pylist() == [0, 1, -1, -1]

arr = pa.array([b"ab", b"cab", b"ba", None], type=pa.large_binary())
result = pc.find_substring(arr, b"ab")
expected = pa.array([0, 1, -1, None], type=pa.int64())
assert expected.equals(result)
result = pc.find_substring_regex(arr, "a?b", ignore_case=True)
assert result.to_pylist() == [0, 1, 0, 0]


def test_match_like():
Expand Down