Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion cpp/src/arrow/compute/api_scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,13 @@ struct ArithmeticOptions : public FunctionOptions {
};

struct ARROW_EXPORT MatchSubstringOptions : public FunctionOptions {
explicit MatchSubstringOptions(std::string pattern) : pattern(std::move(pattern)) {}
explicit MatchSubstringOptions(std::string pattern, bool ignore_case = false)
: pattern(std::move(pattern)), ignore_case(ignore_case) {}

/// The exact substring (or regex, depending on kernel) to look for inside input values.
std::string pattern;
/// Whether to perform a case-insensitive match.
bool ignore_case = false;
};

struct ARROW_EXPORT SplitOptions : public FunctionOptions {
Expand Down
130 changes: 89 additions & 41 deletions cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -433,40 +433,15 @@ void StringBoolTransform(KernelContext* ctx, const ExecBatch& batch,

using MatchSubstringState = OptionsWrapper<MatchSubstringOptions>;

template <typename Type, typename Matcher>
struct MatchSubstring {
using offset_type = typename Type::offset_type;
static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// TODO Cache matcher across invocations (for regex compilation)
ARROW_ASSIGN_OR_RAISE(auto matcher, Matcher::Make(MatchSubstringState::Get(ctx)));
StringBoolTransform<Type>(
ctx, batch,
[&matcher](const void* raw_offsets, const uint8_t* data, int64_t length,
int64_t output_offset, uint8_t* output) {
const offset_type* offsets = reinterpret_cast<const offset_type*>(raw_offsets);
FirstTimeBitmapWriter bitmap_writer(output, output_offset, length);
for (int64_t i = 0; i < length; ++i) {
const char* current_data = reinterpret_cast<const char*>(data + offsets[i]);
int64_t current_length = offsets[i + 1] - offsets[i];
if (matcher->Match(util::string_view(current_data, current_length))) {
bitmap_writer.Set();
}
bitmap_writer.Next();
}
bitmap_writer.Finish();
},
out);
return Status::OK();
}
};

// This is an implementation of the Knuth-Morris-Pratt algorithm
struct PlainSubstringMatcher {
const MatchSubstringOptions& options_;
std::vector<int64_t> prefix_table;

static Result<std::unique_ptr<PlainSubstringMatcher>> Make(
const MatchSubstringOptions& options) {
// Should be handled by partial template specialization below
DCHECK(!options.ignore_case);
return ::arrow::internal::make_unique<PlainSubstringMatcher>(options);
}

Expand Down Expand Up @@ -509,38 +484,109 @@ struct PlainSubstringMatcher {
bool Match(util::string_view current) const { return Find(current) >= 0; }
};

const FunctionDoc match_substring_doc(
"Match strings against literal pattern",
("For each string in `strings`, emit true iff it contains a given pattern.\n"
"Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
{"strings"}, "MatchSubstringOptions");

#ifdef ARROW_WITH_RE2
struct RegexSubstringMatcher {
const MatchSubstringOptions& options_;
const RE2 regex_match_;

static Result<std::unique_ptr<RegexSubstringMatcher>> Make(
const MatchSubstringOptions& options) {
auto matcher = ::arrow::internal::make_unique<RegexSubstringMatcher>(options);
const MatchSubstringOptions& options, bool literal = false) {
auto matcher =
::arrow::internal::make_unique<RegexSubstringMatcher>(options, literal);
RETURN_NOT_OK(RegexStatus(matcher->regex_match_));
return std::move(matcher);
}

explicit RegexSubstringMatcher(const MatchSubstringOptions& options)
: options_(options), regex_match_(options_.pattern, RE2::Quiet) {}
explicit RegexSubstringMatcher(const MatchSubstringOptions& options,
bool literal = false)
: options_(options),
regex_match_(options_.pattern, MakeRE2Options(options, literal)) {}

/// Return true if the compiled regex matches anywhere inside `current`.
bool Match(util::string_view current) const {
  // Wrap the view's bytes for RE2 and run an unanchored (partial) match.
  const re2::StringPiece haystack(current.data(), current.length());
  return re2::RE2::PartialMatch(haystack, regex_match_);
}

/// Translate MatchSubstringOptions into the RE2 options used to compile the pattern.
///
/// \param options supplies `ignore_case`; matching is case-sensitive unless it is set
/// \param literal forwarded to RE2's `set_literal`
/// \return RE2 options initialised from RE2::Quiet with the two flags above applied
static RE2::RE2::Options MakeRE2Options(const MatchSubstringOptions& options,
                                        bool literal) {
  const bool case_sensitive = !options.ignore_case;
  RE2::RE2::Options compiled_options(RE2::Quiet);
  compiled_options.set_literal(literal);
  compiled_options.set_case_sensitive(case_sensitive);
  return compiled_options;
}
};
#endif

/// Shared execution body for the substring-matching kernels.
///
/// Given an already-constructed matcher, walks every value of the input batch
/// and sets the corresponding output bit when `matcher->Match` accepts it.
template <typename Type, typename Matcher>
struct MatchSubstringImpl {
  using offset_type = typename Type::offset_type;

  /// \param ctx kernel context, forwarded to StringBoolTransform
  /// \param batch input batch, forwarded to StringBoolTransform
  /// \param[out] out output datum, forwarded to StringBoolTransform
  /// \param matcher matcher whose `Match(string_view)` decides each output bit
  /// \return Status::OK()
  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out,
                     const Matcher* matcher) {
    auto fill_bitmap = [&matcher](const void* raw_offsets, const uint8_t* data,
                                  int64_t length, int64_t output_offset,
                                  uint8_t* output) {
      const auto* value_offsets = reinterpret_cast<const offset_type*>(raw_offsets);
      FirstTimeBitmapWriter writer(output, output_offset, length);
      for (int64_t row = 0; row < length; ++row) {
        // Value `row` occupies bytes [value_offsets[row], value_offsets[row + 1]).
        const char* value_begin =
            reinterpret_cast<const char*>(data + value_offsets[row]);
        const int64_t value_length = value_offsets[row + 1] - value_offsets[row];
        const util::string_view value(value_begin, value_length);
        if (matcher->Match(value)) {
          writer.Set();
        }
        writer.Next();
      }
      writer.Finish();
    };
    StringBoolTransform<Type>(ctx, batch, fill_bitmap, out);
    return Status::OK();
  }
};

template <typename Type, typename Matcher>
struct MatchSubstring {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand why there are two classes, MatchSubstring and MatchSubstringImpl. It seems one should be sufficient?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's only one of each. I moved them around in this PR, but it's the same as before.

/// Generic kernel entry point: build a `Matcher` from the MatchSubstringOptions
/// stored in the kernel state, then hand it to MatchSubstringImpl for execution.
/// A failure in `Matcher::Make` is returned to the caller via
/// ARROW_ASSIGN_OR_RAISE.
static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// TODO Cache matcher across invocations (for regex compilation)
ARROW_ASSIGN_OR_RAISE(auto matcher, Matcher::Make(MatchSubstringState::Get(ctx)));
// The matcher is owned by this frame; Impl only borrows the raw pointer.
return MatchSubstringImpl<Type, Matcher>::Exec(ctx, batch, out, matcher.get());
}
};

/// Specialization for the plain (literal) substring matcher.
///
/// PlainSubstringMatcher is only used for case-sensitive searches. When
/// `ignore_case` is set, the search is delegated to RegexSubstringMatcher
/// constructed with `literal = true`. Builds without RE2 cannot honour
/// `ignore_case` and report NotImplemented instead.
template <typename Type>
struct MatchSubstring<Type, PlainSubstringMatcher> {
  /// \param ctx kernel context carrying the MatchSubstringOptions state
  /// \param batch input batch, forwarded to MatchSubstringImpl
  /// \param[out] out output datum, forwarded to MatchSubstringImpl
  /// \return the error from matcher construction, the status of
  ///         MatchSubstringImpl::Exec, or NotImplemented when `ignore_case`
  ///         is requested in a build without RE2
  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
    // Bind by const reference: the options are only read here, and a by-value
    // `auto` would copy the pattern string on every kernel invocation.
    const auto& options = MatchSubstringState::Get(ctx);
    if (options.ignore_case) {
#ifdef ARROW_WITH_RE2
      ARROW_ASSIGN_OR_RAISE(auto matcher,
                            RegexSubstringMatcher::Make(options, /*literal=*/true));
      return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
                                                                   matcher.get());
#else
      return Status::NotImplemented("ignore_case requires RE2");
#endif
    }
    ARROW_ASSIGN_OR_RAISE(auto matcher, PlainSubstringMatcher::Make(options));
    return MatchSubstringImpl<Type, PlainSubstringMatcher>::Exec(ctx, batch, out,
                                                                 matcher.get());
  }
};

// Documentation for the literal-pattern substring match function: a short
// summary, a longer description, the single argument name ("strings"), and the
// options class it expects (MatchSubstringOptions).
const FunctionDoc match_substring_doc(
"Match strings against literal pattern",
("For each string in `strings`, emit true iff it contains a given pattern.\n"
"Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
"If ignore_case is set, only simple case folding is performed."),
{"strings"}, "MatchSubstringOptions");

#ifdef ARROW_WITH_RE2
const FunctionDoc match_substring_regex_doc(
"Match strings against regex pattern",
("For each string in `strings`, emit true iff it matches a given pattern at any "
"position.\n"
"Null inputs emit null. The pattern must be given in MatchSubstringOptions."),
"Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
"If ignore_case is set, only simple case folding is performed."),
{"strings"}, "MatchSubstringOptions");

// SQL LIKE match
Expand Down Expand Up @@ -605,14 +651,16 @@ struct MatchLike {

Status status;
std::string pattern;
if (re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch,
if (!original_options.ignore_case &&
re2::RE2::FullMatch(original_options.pattern, kLikePatternIsSubstringMatch,
&pattern)) {
MatchSubstringOptions converted_options{pattern};
MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
MatchSubstringState converted_state(converted_options);
ctx->SetState(&converted_state);
status = MatchSubstring<StringType, PlainSubstringMatcher>::Exec(ctx, batch, out);
} else {
MatchSubstringOptions converted_options{MakeLikeRegex(original_options)};
MatchSubstringOptions converted_options{MakeLikeRegex(original_options),
original_options.ignore_case};
MatchSubstringState converted_state(converted_options);
ctx->SetState(&converted_state);
status = MatchSubstring<StringType, RegexSubstringMatcher>::Exec(ctx, batch, out);
Expand Down
38 changes: 34 additions & 4 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -377,8 +377,8 @@ TYPED_TEST(TestStringKernels, IsUpperAscii) {
TYPED_TEST(TestStringKernels, MatchSubstring) {
MatchSubstringOptions options{"ab"};
this->CheckUnary("match_substring", "[]", boolean(), "[]", &options);
this->CheckUnary("match_substring", R"(["abc", "acb", "cab", null, "bac"])", boolean(),
"[true, false, true, null, false]", &options);
this->CheckUnary("match_substring", R"(["abc", "acb", "cab", null, "bac", "AB"])",
boolean(), "[true, false, true, null, false, false]", &options);

MatchSubstringOptions options_repeated{"abab"};
this->CheckUnary("match_substring", R"(["abab", "ab", "cababc", null, "bac"])",
Expand All @@ -393,12 +393,29 @@ TYPED_TEST(TestStringKernels, MatchSubstring) {
&options_double_char_2);
}

#ifdef ARROW_WITH_RE2
// Case-insensitive literal matching (RE2 builds).
// The pattern deliberately contains "(" (a regex metacharacter, so the match
// must treat the pattern literally) and the non-ASCII "é" (so "É" must match
// while plain "e"/"E" must not).
TYPED_TEST(TestStringKernels, MatchSubstringIgnoreCase) {
MatchSubstringOptions options_insensitive{"aé(", /*ignore_case=*/true};
this->CheckUnary("match_substring", R"(["abc", "aEb", "baÉ(", "aé(", "ae(", "Aé("])",
boolean(), "[false, false, true, true, false, true]",
&options_insensitive);
}
#else
// Builds without RE2: requesting ignore_case on "match_substring" must fail
// with NotImplemented and the "ignore_case requires RE2" message.
TYPED_TEST(TestStringKernels, MatchSubstringIgnoreCase) {
Datum input = ArrayFromJSON(this->type(), R"(["a"])");
MatchSubstringOptions options{"a", /*ignore_case=*/true};
EXPECT_RAISES_WITH_MESSAGE_THAT(NotImplemented,
::testing::HasSubstr("ignore_case requires RE2"),
CallFunction("match_substring", {input}, &options));
}
#endif

#ifdef ARROW_WITH_RE2
TYPED_TEST(TestStringKernels, MatchSubstringRegex) {
MatchSubstringOptions options{"ab"};
this->CheckUnary("match_substring_regex", "[]", boolean(), "[]", &options);
this->CheckUnary("match_substring_regex", R"(["abc", "acb", "cab", null, "bac"])",
boolean(), "[true, false, true, null, false]", &options);
this->CheckUnary("match_substring_regex", R"(["abc", "acb", "cab", null, "bac", "AB"])",
boolean(), "[true, false, true, null, false, false]", &options);
MatchSubstringOptions options_repeated{"(ab){2}"};
this->CheckUnary("match_substring_regex", R"(["abab", "ab", "cababc", null, "bac"])",
boolean(), "[true, false, true, null, false]", &options_repeated);
Expand All @@ -411,6 +428,10 @@ TYPED_TEST(TestStringKernels, MatchSubstringRegex) {
MatchSubstringOptions options_plus{"a+b"};
this->CheckUnary("match_substring_regex", R"(["aacb", "aab", "dab", "caaab", "b", ""])",
boolean(), "[false, true, true, true, false, false]", &options_plus);
MatchSubstringOptions options_insensitive{"ab|é", /*ignore_case=*/true};
this->CheckUnary("match_substring_regex", R"(["abc", "acb", "É", null, "bac", "AB"])",
boolean(), "[true, false, true, null, false, true]",
&options_insensitive);

// Unicode character semantics
// "\pL" means: unicode category "letter"
Expand Down Expand Up @@ -458,6 +479,15 @@ TYPED_TEST(TestStringKernels, MatchLike) {
this->CheckUnary("match_like", inputs, boolean(),
"[false, false, true, false, false, false, false, null]",
&regex_match);

// ignore_case means this still gets mapped to a regex search
MatchSubstringOptions insensitive_substring{"%é%", /*ignore_case=*/true};
this->CheckUnary("match_like", R"(["é", "fooÉbar", "e"])", boolean(),
"[true, true, false]", &insensitive_substring);

MatchSubstringOptions insensitive_regex{"_é%", /*ignore_case=*/true};
this->CheckUnary("match_like", R"(["éfoo", "aÉfoo", "e"])", boolean(),
"[false, true, false]", &insensitive_regex);
}

TYPED_TEST(TestStringKernels, MatchLikeEscaping) {
Expand Down
8 changes: 4 additions & 4 deletions python/pyarrow/_compute.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -657,14 +657,14 @@ cdef class _MatchSubstringOptions(FunctionOptions):
cdef const CFunctionOptions* get_options(self) except NULL:
return self.match_substring_options.get()

def _set_options(self, pattern):
def _set_options(self, pattern, bint ignore_case):
self.match_substring_options.reset(
new CMatchSubstringOptions(tobytes(pattern)))
new CMatchSubstringOptions(tobytes(pattern), ignore_case))


class MatchSubstringOptions(_MatchSubstringOptions):
def __init__(self, pattern):
self._set_options(pattern)
def __init__(self, pattern, bint ignore_case=False):
self._set_options(pattern, ignore_case)


cdef class _TrimOptions(FunctionOptions):
Expand Down
18 changes: 12 additions & 6 deletions python/pyarrow/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ def find_substring(array, pattern):
MatchSubstringOptions(pattern))


def match_like(array, pattern):
def match_like(array, pattern, *, ignore_case=False):
"""
Test if the SQL-style LIKE pattern *pattern* matches a value of a
string array.
Expand All @@ -321,17 +321,19 @@ def match_like(array, pattern):
characters, '_' will match exactly one character, and all
other characters match themselves. To match a literal percent
sign or underscore, precede the character with a backslash.
ignore_case : bool, default False
Ignore case while searching.

Returns
-------
result : pyarrow.Array or pyarrow.ChunkedArray

"""
return call_function("match_like", [array],
MatchSubstringOptions(pattern))
MatchSubstringOptions(pattern, ignore_case))


def match_substring(array, pattern):
def match_substring(array, pattern, *, ignore_case=False):
"""
Test if substring *pattern* is contained within a value of a string array.

Expand All @@ -340,16 +342,18 @@ def match_substring(array, pattern):
array : pyarrow.Array or pyarrow.ChunkedArray
pattern : str
pattern to search for exact matches
ignore_case : bool, default False
Ignore case while searching.

Returns
-------
result : pyarrow.Array or pyarrow.ChunkedArray
"""
return call_function("match_substring", [array],
MatchSubstringOptions(pattern))
MatchSubstringOptions(pattern, ignore_case))


def match_substring_regex(array, pattern):
def match_substring_regex(array, pattern, *, ignore_case=False):
"""
Test if regex *pattern* matches at any position a value of a string array.

Expand All @@ -358,13 +362,15 @@ def match_substring_regex(array, pattern):
array : pyarrow.Array or pyarrow.ChunkedArray
pattern : str
regex pattern to search
ignore_case : bool, default False
Ignore case while searching.

Returns
-------
result : pyarrow.Array or pyarrow.ChunkedArray
"""
return call_function("match_substring_regex", [array],
MatchSubstringOptions(pattern))
MatchSubstringOptions(pattern, ignore_case))


def sum(array):
Expand Down
3 changes: 2 additions & 1 deletion python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -1783,8 +1783,9 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:

cdef cppclass CMatchSubstringOptions \
"arrow::compute::MatchSubstringOptions"(CFunctionOptions):
CMatchSubstringOptions(c_string pattern)
CMatchSubstringOptions(c_string pattern, c_bool ignore_case)
c_string pattern
c_bool ignore_case

cdef cppclass CTrimOptions \
"arrow::compute::TrimOptions"(CFunctionOptions):
Expand Down
Loading