diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc b/cpp/src/arrow/compute/kernels/scalar_string.cc index dbacb6bb96f..e6820fe4747 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string.cc @@ -917,13 +917,44 @@ struct FindSubstring { } }; +#ifdef ARROW_WITH_RE2 +struct FindSubstringRegex { + std::unique_ptr regex_match_; + + explicit FindSubstringRegex(const MatchSubstringOptions& options, + bool literal = false) { + std::string regex = "("; + regex.reserve(options.pattern.length() + 2); + regex += literal ? RE2::QuoteMeta(options.pattern) : options.pattern; + regex += ")"; + regex_match_.reset(new RE2(std::move(regex), RegexSubstringMatcher::MakeRE2Options( + options, /*literal=*/false))); + } + + template + OutValue Call(KernelContext*, util::string_view val, Status*) const { + re2::StringPiece piece(val.data(), val.length()); + re2::StringPiece match; + if (re2::RE2::PartialMatch(piece, *regex_match_, &match)) { + return static_cast(match.data() - piece.data()); + } + return -1; + } +}; +#endif + template struct FindSubstringExec { using OffsetType = typename TypeTraits::OffsetType; static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const MatchSubstringOptions& options = MatchSubstringState::Get(ctx); if (options.ignore_case) { - return Status::NotImplemented("find_substring with ignore_case"); +#ifdef ARROW_WITH_RE2 + applicator::ScalarUnaryNotNullStateful + kernel{FindSubstringRegex(options, /*literal=*/true)}; + return kernel.Exec(ctx, batch, out); +#endif + return Status::NotImplemented("ignore_case requires RE2"); } applicator::ScalarUnaryNotNullStateful kernel{ FindSubstring(PlainSubstringMatcher(options))}; @@ -938,21 +969,52 @@ const FunctionDoc find_substring_doc( "Null inputs emit null. The pattern must be given in MatchSubstringOptions."), {"strings"}, "MatchSubstringOptions"); +#ifdef ARROW_WITH_RE2 +template +struct FindSubstringRegexExec { + using OffsetType = typename TypeTraits::OffsetType; + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const MatchSubstringOptions& options = MatchSubstringState::Get(ctx); + applicator::ScalarUnaryNotNullStateful + kernel{FindSubstringRegex(options, /*literal=*/false)}; + return kernel.Exec(ctx, batch, out); + } +}; + +const FunctionDoc find_substring_regex_doc( + "Find location of first match of regex pattern", + ("For each string in `strings`, emit the index of the first match of the given " + "pattern, or -1 if not found.\n" + "Null inputs emit null. The pattern must be given in MatchSubstringOptions."), + {"strings"}, "MatchSubstringOptions"); +#endif + void AddFindSubstring(FunctionRegistry* registry) { - auto func = std::make_shared("find_substring", Arity::Unary(), - &find_substring_doc); - for (const auto& ty : BaseBinaryTypes()) { - std::shared_ptr offset_type; - if (ty->id() == Type::type::LARGE_BINARY || ty->id() == Type::type::LARGE_STRING) { - offset_type = int64(); - } else { - offset_type = int32(); + { + auto func = std::make_shared("find_substring", Arity::Unary(), + &find_substring_doc); + for (const auto& ty : BaseBinaryTypes()) { + auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32(); + DCHECK_OK(func->AddKernel({ty}, offset_type, + GenerateTypeAgnosticVarBinaryBase(ty), + MatchSubstringState::Init)); } - DCHECK_OK(func->AddKernel({ty}, offset_type, - GenerateTypeAgnosticVarBinaryBase(ty), - MatchSubstringState::Init)); + DCHECK_OK(registry->AddFunction(std::move(func))); } - DCHECK_OK(registry->AddFunction(std::move(func))); +#ifdef ARROW_WITH_RE2 + { + auto func = std::make_shared("find_substring_regex", Arity::Unary(), + &find_substring_regex_doc); + for (const auto& ty : BaseBinaryTypes()) { + auto offset_type = offset_bit_width(ty->id()) == 64 ? int64() : int32(); + DCHECK_OK( + func->AddKernel({ty}, offset_type, + GenerateTypeAgnosticVarBinaryBase(ty), + MatchSubstringState::Init)); + } + DCHECK_OK(registry->AddFunction(std::move(func))); + } +#endif } // Substring count diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc b/cpp/src/arrow/compute/kernels/scalar_string_test.cc index 6192e0a5dd7..d5c256fd8ef 100644 --- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc @@ -173,6 +173,36 @@ TYPED_TEST(TestBinaryKernels, FindSubstring) { "[0, 0, null]", &options_empty); } +#ifdef ARROW_WITH_RE2 +TYPED_TEST(TestBinaryKernels, FindSubstringIgnoreCase) { + MatchSubstringOptions options{"?AB)", /*ignore_case=*/true}; + this->CheckUnary("find_substring", "[]", this->offset_type(), "[]", &options); + this->CheckUnary("find_substring", + R"-(["?aB)c", "acb", "c?Ab)", null, "?aBc", "AB)"])-", + this->offset_type(), "[0, -1, 1, null, -1, -1]", &options); +} + +TYPED_TEST(TestBinaryKernels, FindSubstringRegex) { + MatchSubstringOptions options{"a+", /*ignore_case=*/false}; + this->CheckUnary("find_substring_regex", "[]", this->offset_type(), "[]", &options); + this->CheckUnary("find_substring_regex", R"(["a", "A", "baaa", null, "", "AaaA"])", + this->offset_type(), "[0, -1, 1, null, -1, 1]", &options); + + options.ignore_case = true; + this->CheckUnary("find_substring_regex", "[]", this->offset_type(), "[]", &options); + this->CheckUnary("find_substring_regex", R"(["a", "A", "baaa", null, "", "AaaA"])", + this->offset_type(), "[0, 0, 1, null, -1, 0]", &options); +} +#else +TYPED_TEST(TestBinaryKernels, FindSubstringIgnoreCase) { + MatchSubstringOptions options{"a+", /*ignore_case=*/true}; + Datum input = ArrayFromJSON(this->type(), R"(["a"])"); + EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented, + ::testing::HasSubstr("ignore_case requires RE2"), + CallFunction("find_substring", {input}, &options)); +} +#endif + TYPED_TEST(TestBinaryKernels, CountSubstring) { MatchSubstringOptions options{"aba"}; this->CheckUnary("count_substring", "[]", this->offset_type(), "[]", &options); diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 147885560f5..e785756dcda 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -597,7 +597,9 @@ Containment tests +---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ | ends_with | Unary | String-like | Boolean (2) | :struct:`MatchSubstringOptions` | +---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ -| find_substring | Unary | String-like | Int32 or Int64 (3) | :struct:`MatchSubstringOptions` | +| find_substring | Unary | Binary- and String-like | Int32 or Int64 (3) | :struct:`MatchSubstringOptions` | ++---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ +| find_substring_regex | Unary | Binary- and String-like | Int32 or Int64 (3) | :struct:`MatchSubstringOptions` | +---------------------------+------------+------------------------------------+--------------------+----------------------------------------+ | index_in | Unary | Boolean, Null, Numeric, Temporal, | Int32 (4) | :struct:`SetLookupOptions` | | | | Binary- and String-like | | | diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 80fcb2078f1..8b264ed9b83 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -205,6 +205,7 @@ Containment tests count_substring_regex ends_with find_substring + find_substring_regex index_in is_in match_like diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index b258b551f02..aacf8456c1b 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -331,7 +331,7 @@ def count_substring_regex(array, pattern, *, ignore_case=False): MatchSubstringOptions(pattern, ignore_case)) -def find_substring(array, pattern): +def find_substring(array, pattern, *, ignore_case=False): """ Find the index of the first occurrence of substring *pattern* in each value of a string array. @@ -341,13 +341,36 @@ def find_substring(array, pattern): array : pyarrow.Array or pyarrow.ChunkedArray pattern : str pattern to search for exact matches + ignore_case : bool, default False + Ignore case while searching. Returns ------- result : pyarrow.Array or pyarrow.ChunkedArray """ return call_function("find_substring", [array], - MatchSubstringOptions(pattern)) + MatchSubstringOptions(pattern, ignore_case)) + + +def find_substring_regex(array, pattern, *, ignore_case=False): + """ + Find the index of the first match of regex *pattern* in each + value of a string array. + + Parameters + ---------- + array : pyarrow.Array or pyarrow.ChunkedArray + pattern : str + regex pattern to search for + ignore_case : bool, default False + Ignore case while searching. + + Returns + ------- + result : pyarrow.Array or pyarrow.ChunkedArray + """ + return call_function("find_substring_regex", [array], + MatchSubstringOptions(pattern, ignore_case)) def match_like(array, pattern, *, ignore_case=False): diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index efe2e6be2f8..8b294b85759 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -319,25 +319,20 @@ def test_count_substring_regex(): def test_find_substring(): - arr = pa.array(["ab", "cab", "ba", None]) - result = pc.find_substring(arr, "ab") - expected = pa.array([0, 1, -1, None], type=pa.int32()) - assert expected.equals(result) + for ty in [pa.string(), pa.binary(), pa.large_string(), pa.large_binary()]: + arr = pa.array(["ab", "cab", "ba", None], type=ty) + result = pc.find_substring(arr, "ab") + assert result.to_pylist() == [0, 1, -1, None] - arr = pa.array(["ab", "cab", "ba", None], type=pa.large_string()) - result = pc.find_substring(arr, "ab") - expected = pa.array([0, 1, -1, None], type=pa.int64()) - assert expected.equals(result) + result = pc.find_substring_regex(arr, "a?b") + assert result.to_pylist() == [0, 1, 0, None] - arr = pa.array([b"ab", b"cab", b"ba", None]) - result = pc.find_substring(arr, b"ab") - expected = pa.array([0, 1, -1, None], type=pa.int32()) - assert expected.equals(result) + arr = pa.array(["ab*", "cAB*", "ba", "aB?"], type=ty) + result = pc.find_substring(arr, "aB*", ignore_case=True) + assert result.to_pylist() == [0, 1, -1, -1] - arr = pa.array([b"ab", b"cab", b"ba", None], type=pa.large_binary()) - result = pc.find_substring(arr, b"ab") - expected = pa.array([0, 1, -1, None], type=pa.int64()) - assert expected.equals(result) + result = pc.find_substring_regex(arr, "a?b", ignore_case=True) + assert result.to_pylist() == [0, 1, 0, 0] def test_match_like():