Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 50 additions & 3 deletions cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -488,21 +488,25 @@ struct PlainSubstringMatcher {
}
}

/// \brief Locate the first occurrence of the pattern in `current`.
///
/// Scans `current` one `char` at a time, using `prefix_table` to decide how
/// far to fall back in the pattern after a mismatch.
///
/// \param current the data to search
/// \return the zero-based position (counted in `char`s, i.e. bytes) at which
///         the first occurrence starts, or -1 if the pattern does not occur.
///         An empty pattern is reported at position 0 for any input.
int64_t Find(util::string_view current) const {
  // Phase 2: Find the prefix in the data
  const auto pattern_length = static_cast<int64_t>(options_.pattern.size());
  // An empty pattern occurs at the very start of any input, including empty
  // input. Without this guard the loop below could never report position 0
  // and could index past the end of the pattern.
  if (pattern_length == 0) {
    return 0;
  }
  int64_t pattern_pos = 0;
  int64_t pos = 0;
  for (const auto c : current) {
    // On mismatch, fall back through the prefix table until `c` extends the
    // candidate prefix; a negative table entry ends the fallback.
    while ((pattern_pos >= 0) && (options_.pattern[pattern_pos] != c)) {
      pattern_pos = prefix_table[pattern_pos];
    }
    pattern_pos++;
    if (pattern_pos == pattern_length) {
      // `pos` is the position of the last matched char; convert it to the
      // position where the occurrence starts.
      return pos + 1 - pattern_length;
    }
    pos++;
  }
  return -1;
}

/// \brief Return true iff the pattern occurs anywhere in `current`.
bool Match(util::string_view current) const { return Find(current) >= 0; }
};

const FunctionDoc match_substring_doc(
Expand Down Expand Up @@ -664,6 +668,48 @@ void AddMatchSubstring(FunctionRegistry* registry) {
#endif
}

// Substring find - lfind/index/etc.

/// Stateful per-value operator for "find_substring": wraps a
/// PlainSubstringMatcher and reports what its Find() returns for each value.
struct FindSubstring {
  // Matcher applied to every input value.
  const PlainSubstringMatcher matcher_;

  explicit FindSubstring(PlainSubstringMatcher matcher) : matcher_(std::move(matcher)) {}

  /// Run the matcher on `val` and convert the result to the output value type.
  /// The context and status arguments are not used.
  template <typename OutValue, typename... Ignored>
  OutValue Call(KernelContext*, util::string_view val, Status*) const {
    const auto found_at = matcher_.Find(val);
    return static_cast<OutValue>(found_at);
  }
};

/// Exec function for the "find_substring" kernels, instantiated once per
/// input type. The output type is the offset type that TypeTraits associates
/// with InputType.
template <typename InputType>
Status FindSubstringExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
  using OutputType = typename TypeTraits<InputType>::OffsetType;
  using FindKernel =
      applicator::ScalarUnaryNotNullStateful<OutputType, InputType, FindSubstring>;

  // The matcher is built from the state retrieved via MatchSubstringState::Get.
  FindKernel find_kernel{
      FindSubstring(PlainSubstringMatcher(MatchSubstringState::Get(ctx)))};
  return find_kernel.Exec(ctx, batch, out);
}

// User-facing documentation for the "find_substring" function. The reported
// index is a byte position: the search walks each input one `char` at a time.
const FunctionDoc find_substring_doc(
    "Find first occurrence of substring",
    ("For each string in `strings`, emit the index in bytes of the first occurrence "
     "of the given pattern, or -1 if not found.\n"
     "Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
    {"strings"}, "MatchSubstringOptions");

/// Create the unary "find_substring" scalar function, attach one kernel per
/// supported input type, and add it to `registry`.
void AddFindSubstring(FunctionRegistry* registry) {
  auto find_fn = std::make_shared<ScalarFunction>("find_substring", Arity::Unary(),
                                                  &find_substring_doc);

  // binary / utf8 inputs produce int32 results.
  DCHECK_OK(find_fn->AddKernel({binary()}, int32(), FindSubstringExec<BinaryType>,
                               MatchSubstringState::Init));
  DCHECK_OK(find_fn->AddKernel({utf8()}, int32(), FindSubstringExec<StringType>,
                               MatchSubstringState::Init));

  // large_binary / large_utf8 inputs produce int64 results.
  DCHECK_OK(find_fn->AddKernel({large_binary()}, int64(),
                               FindSubstringExec<LargeBinaryType>,
                               MatchSubstringState::Init));
  DCHECK_OK(find_fn->AddKernel({large_utf8()}, int64(),
                               FindSubstringExec<LargeStringType>,
                               MatchSubstringState::Init));

  DCHECK_OK(registry->AddFunction(std::move(find_fn)));
}

// IsAlpha/Digit etc

#ifdef ARROW_WITH_UTF8PROC
Expand Down Expand Up @@ -2626,6 +2672,7 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
AddBinaryLength(registry);
AddUtf8Length(registry);
AddMatchSubstring(registry);
AddFindSubstring(registry);
MakeUnaryStringBatchKernelWithState<ReplaceSubStringPlain>(
"replace_substring", registry, &replace_substring_doc,
MemAllocation::NO_PREALLOCATE);
Expand Down
38 changes: 38 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,25 @@ TYPED_TEST(TestBinaryKernels, BinaryLength) {
this->offset_type(), "[3, null, 10, 0, 1]");
}

// Checks "find_substring" on binary inputs: empty input, found / not-found /
// null values, and patterns with repeated characters.
TYPED_TEST(TestBinaryKernels, FindSubstring) {
  // Simple two-character pattern.
  MatchSubstringOptions simple_pattern{"ab"};
  this->CheckUnary("find_substring", "[]", this->offset_type(), "[]", &simple_pattern);
  this->CheckUnary("find_substring", R"(["abc", "acb", "cab", null, "bac"])",
                   this->offset_type(), "[0, -1, 1, null, -1]", &simple_pattern);

  // Pattern that repeats itself.
  MatchSubstringOptions repeated_pattern{"abab"};
  this->CheckUnary("find_substring", R"(["abab", "ab", "cababc", null, "bac"])",
                   this->offset_type(), "[0, -1, 1, null, -1]", &repeated_pattern);

  // Pattern starting with a doubled character.
  MatchSubstringOptions doubled_head{"aab"};
  this->CheckUnary("find_substring", R"(["aacb", "aab", "ab", "aaab"])",
                   this->offset_type(), "[-1, 0, -1, 1]", &doubled_head);

  // Doubled characters at both ends, searched in a longer input.
  MatchSubstringOptions doubled_both_ends{"bbcaa"};
  this->CheckUnary("find_substring", R"(["abcbaabbbcaabccabaab"])", this->offset_type(),
                   "[7]", &doubled_both_ends);
}

// Typed-test fixture used by the TestStringKernels tests; it adds nothing on
// top of BaseTestStringKernels<TestType>.
template <typename TestType>
class TestStringKernels : public BaseTestStringKernels<TestType> {};

Expand Down Expand Up @@ -470,6 +489,25 @@ TYPED_TEST(TestStringKernels, MatchLikeEscaping) {
}
#endif

// Checks "find_substring" on string inputs: empty input, found / not-found /
// null values, and patterns with repeated characters.
TYPED_TEST(TestStringKernels, FindSubstring) {
  // Simple two-character pattern.
  MatchSubstringOptions simple_pattern{"ab"};
  this->CheckUnary("find_substring", "[]", this->offset_type(), "[]", &simple_pattern);
  this->CheckUnary("find_substring", R"(["abc", "acb", "cab", null, "bac"])",
                   this->offset_type(), "[0, -1, 1, null, -1]", &simple_pattern);

  // Pattern that repeats itself.
  MatchSubstringOptions repeated_pattern{"abab"};
  this->CheckUnary("find_substring", R"(["abab", "ab", "cababc", null, "bac"])",
                   this->offset_type(), "[0, -1, 1, null, -1]", &repeated_pattern);

  // Pattern starting with a doubled character.
  MatchSubstringOptions doubled_head{"aab"};
  this->CheckUnary("find_substring", R"(["aacb", "aab", "ab", "aaab"])",
                   this->offset_type(), "[-1, 0, -1, 1]", &doubled_head);

  // Doubled characters at both ends, searched in a longer input.
  MatchSubstringOptions doubled_both_ends{"bbcaa"};
  this->CheckUnary("find_substring", R"(["abcbaabbbcaabccabaab"])", this->offset_type(),
                   "[7]", &doubled_both_ends);
}

TYPED_TEST(TestStringKernels, SplitBasics) {
SplitPatternOptions options{" "};
// basics
Expand Down
55 changes: 31 additions & 24 deletions docs/source/cpp/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ String transforms
* \(1) Each ASCII character in the input is converted to lowercase or
uppercase. Non-ASCII characters are left untouched.

* \(2) ASCII input is reversed to the output. If non-ASCII characters
* \(2) ASCII input is reversed to the output. If non-ASCII characters
are present, ``Invalid`` :class:`Status` will be returned.

* \(3) Output is the physical length in bytes of each input element. Output
Expand All @@ -482,7 +482,7 @@ String transforms
pattern contains groups, backreferencing can be used.

* \(6) Output is the number of characters (not bytes) of each input element.
Output type is Int32 for String, Int64 for LargeString.
Output type is Int32 for String, Int64 for LargeString.

* \(7) Each UTF8-encoded character in the input is converted to lowercase or
uppercase.
Expand Down Expand Up @@ -541,40 +541,48 @@ These functions trim off characters on both sides (trim), or the left (ltrim) or
Containment tests
~~~~~~~~~~~~~~~~~

+---------------------------+------------+------------------------------------+---------------+----------------------------------------+
| Function name | Arity | Input types | Output type | Options class |
+===========================+============+====================================+===============+========================================+
| match_like | Unary | String-like | Boolean (1) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+---------------+----------------------------------------+
| match_substring | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+---------------+----------------------------------------+
| match_substring_regex | Unary | String-like | Boolean (3) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+---------------+----------------------------------------+
| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (4) | :struct:`SetLookupOptions` |
| | | Binary- and String-like | | |
+---------------------------+------------+------------------------------------+---------------+----------------------------------------+
| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (5) | :struct:`SetLookupOptions` |
| | | Binary- and String-like | | |
+---------------------------+------------+------------------------------------+---------------+----------------------------------------+

* \(1) Output is true iff the SQL-style LIKE pattern
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
| Function name | Arity | Input types | Output type | Options class |
+===========================+============+====================================+====================+========================================+
| find_substring | Unary | String-like | Int32 or Int64 (1) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
| match_like | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
| match_substring | Unary | String-like | Boolean (3) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
| match_substring_regex | Unary | String-like | Boolean (4) | :struct:`MatchSubstringOptions` |
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
| index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (5) | :struct:`SetLookupOptions` |
| | | Binary- and String-like | | |
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+
| is_in | Unary | Boolean, Null, Numeric, Temporal, | Boolean (6) | :struct:`SetLookupOptions` |
| | | Binary- and String-like | | |
+---------------------------+------------+------------------------------------+--------------------+----------------------------------------+


* \(1) Output is the index (in bytes) of the first occurrence of
:member:`MatchSubstringOptions::pattern` in the corresponding input
string, or -1 if the pattern is not found. Output type is Int32 for
Binary/String, Int64 for LargeBinary/LargeString.

* \(2) Output is true iff the SQL-style LIKE pattern
:member:`MatchSubstringOptions::pattern` fully matches the
corresponding input element. That is, ``%`` will match any number of
characters, ``_`` will match exactly one character, and any other
character matches itself. To match a literal percent sign or
underscore, precede the character with a backslash.

* \(2) Output is true iff :member:`MatchSubstringOptions::pattern`
* \(3) Output is true iff :member:`MatchSubstringOptions::pattern`
is a substring of the corresponding input element.

* \(3) Output is true iff :member:`MatchSubstringOptions::pattern`
* \(4) Output is true iff :member:`MatchSubstringOptions::pattern`
matches the corresponding input element at any position.

* \(4) Output is the index of the corresponding input element in
* \(5) Output is the index of the corresponding input element in
:member:`SetLookupOptions::value_set`, if found there. Otherwise,
output is null.

* \(5) Output is true iff the corresponding input element is equal to one
* \(6) Output is true iff the corresponding input element is equal to one
of the elements in :member:`SetLookupOptions::value_set`.


Expand Down Expand Up @@ -878,4 +886,3 @@ Structural transforms
* \(2) For each value in the list child array, the index at which it is found
in the list array is appended to the output. Nulls in the parent list array
are discarded.

1 change: 1 addition & 0 deletions docs/source/python/api/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ Containment tests
.. autosummary::
:toctree: ../generated/

find_substring
index_in
is_in
match_like
Expand Down
19 changes: 19 additions & 0 deletions python/pyarrow/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,25 @@ def cast(arr, target_type, safe=True):
return call_function("cast", [arr], options)


def find_substring(array, pattern):
    """
    Locate the substring *pattern* within each value of a string array.

    For every value, the result holds the index of the first occurrence
    of *pattern*, or -1 if the value does not contain it. Null values
    produce null.

    Parameters
    ----------
    array : pyarrow.Array or pyarrow.ChunkedArray
    pattern : str
        pattern to search for exact matches

    Returns
    -------
    result : pyarrow.Array or pyarrow.ChunkedArray
    """
    options = MatchSubstringOptions(pattern)
    return call_function("find_substring", [array], options)


def match_like(array, pattern):
"""
Test if the SQL-style LIKE pattern *pattern* matches a value of a
Expand Down
22 changes: 22 additions & 0 deletions python/pyarrow/tests/test_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,28 @@ def test_variance():
assert pc.variance(data, ddof=1).as_py() == 6.0


def test_find_substring():
    # Each case: (input array, pattern, expected output type).
    # Default string/binary inputs yield int32; the large variants yield int64.
    cases = [
        (pa.array(["ab", "cab", "ba", None]), "ab", pa.int32()),
        (pa.array(["ab", "cab", "ba", None], type=pa.large_string()),
         "ab", pa.int64()),
        (pa.array([b"ab", b"cab", b"ba", None]), b"ab", pa.int32()),
        (pa.array([b"ab", b"cab", b"ba", None], type=pa.large_binary()),
         b"ab", pa.int64()),
    ]
    for arr, pattern, out_type in cases:
        result = pc.find_substring(arr, pattern)
        expected = pa.array([0, 1, -1, None], type=out_type)
        assert expected.equals(result)


def test_match_like():
arr = pa.array(["ab", "ba%", "ba", "ca%d", None])
result = pc.match_like(arr, r"_a\%%")
Expand Down