Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 138 additions & 5 deletions cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,46 @@ struct PlainSubstringMatcher {
bool Match(util::string_view current) const { return Find(current) >= 0; }
};

struct PlainStartsWithMatcher {
const MatchSubstringOptions& options_;

// Binds the matcher to `options`. Only a reference is stored (no copy is made),
// so `options` must stay alive for as long as this matcher is used.
explicit PlainStartsWithMatcher(const MatchSubstringOptions& options)
: options_(options) {}

static Result<std::unique_ptr<PlainStartsWithMatcher>> Make(
const MatchSubstringOptions& options) {
// Should be handled by partial template specialization below
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what this comment means. It looks like there's no template around here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean L612-L653 below which are partial template specializations that convert a case-insensitive prefix/suffix match into an equivalent regex (to avoid having to do Unicode case folding ourselves): https://github.com/apache/arrow/pull/10448/files#diff-eb8300bc4dea7d1c46b2576b7dbd8e42b927ab7d42c031f4aecae892a72ee244R612-R653

DCHECK(!options.ignore_case);
return ::arrow::internal::make_unique<PlainStartsWithMatcher>(options);
}

// Returns true iff `current` begins with the configured pattern (byte-wise,
// case-sensitive comparison).
bool Match(util::string_view current) const {
  const auto& prefix = options_.pattern;
  // string_view::starts_with is C++20, so compare the leading slice instead.
  // substr() clamps the length, so a too-short input simply compares unequal.
  const util::string_view head = current.substr(0, prefix.size());
  return head == prefix;
}
};

struct PlainEndsWithMatcher {
const MatchSubstringOptions& options_;

explicit PlainEndsWithMatcher(const MatchSubstringOptions& options)
: options_(options) {}

static Result<std::unique_ptr<PlainEndsWithMatcher>> Make(
const MatchSubstringOptions& options) {
// Should be handled by partial template specialization below
DCHECK(!options.ignore_case);
return ::arrow::internal::make_unique<PlainEndsWithMatcher>(options);
}

bool Match(util::string_view current) const {
// string_view::ends_with is C++20
return current.size() >= options_.pattern.size() &&
current.substr(current.size() - options_.pattern.size(),
options_.pattern.size()) == options_.pattern;
}
};

#ifdef ARROW_WITH_RE2
struct RegexSubstringMatcher {
const MatchSubstringOptions& options_;
Expand Down Expand Up @@ -581,13 +621,69 @@ struct MatchSubstring<Type, PlainSubstringMatcher> {
}
};

// Kernel entry point for prefix matching. Case-sensitive requests use
// PlainStartsWithMatcher directly; case-insensitive requests are rewritten into
// an anchored regex over the quoted pattern (requires RE2).
template <typename Type>
struct MatchSubstring<Type, PlainStartsWithMatcher> {
  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
    auto options = MatchSubstringState::Get(ctx);
    if (!options.ignore_case) {
      // Common case: plain byte-wise prefix comparison.
      ARROW_ASSIGN_OR_RAISE(auto plain_matcher, PlainStartsWithMatcher::Make(options));
      return MatchSubstringImpl<Type, PlainStartsWithMatcher>::Exec(
          ctx, batch, out, plain_matcher.get());
    }
#ifdef ARROW_WITH_RE2
    // "^" anchors the match at the start; QuoteMeta keeps the pattern literal.
    MatchSubstringOptions regex_options = options;
    regex_options.pattern = "^" + RE2::QuoteMeta(options.pattern);
    ARROW_ASSIGN_OR_RAISE(auto regex_matcher,
                          RegexSubstringMatcher::Make(regex_options));
    return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
                                                                 regex_matcher.get());
#else
    return Status::NotImplemented("ignore_case requires RE2");
#endif
  }
};

// Kernel entry point for suffix matching. Case-sensitive requests use
// PlainEndsWithMatcher directly; case-insensitive requests are rewritten into
// an anchored regex over the quoted pattern (requires RE2).
template <typename Type>
struct MatchSubstring<Type, PlainEndsWithMatcher> {
  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
    auto options = MatchSubstringState::Get(ctx);
    if (!options.ignore_case) {
      // Common case: plain byte-wise suffix comparison.
      ARROW_ASSIGN_OR_RAISE(auto plain_matcher, PlainEndsWithMatcher::Make(options));
      return MatchSubstringImpl<Type, PlainEndsWithMatcher>::Exec(ctx, batch, out,
                                                                  plain_matcher.get());
    }
#ifdef ARROW_WITH_RE2
    // "$" anchors the match at the end; QuoteMeta keeps the pattern literal.
    MatchSubstringOptions regex_options = options;
    regex_options.pattern = RE2::QuoteMeta(options.pattern) + "$";
    ARROW_ASSIGN_OR_RAISE(auto regex_matcher,
                          RegexSubstringMatcher::Make(regex_options));
    return MatchSubstringImpl<Type, RegexSubstringMatcher>::Exec(ctx, batch, out,
                                                                 regex_matcher.get());
#else
    return Status::NotImplemented("ignore_case requires RE2");
#endif
  }
};

// User-facing documentation for the "contains a literal pattern" check:
// a summary, a longer description, the argument names, and the name of the
// options class that carries the pattern.
const FunctionDoc match_substring_doc(
"Match strings against literal pattern",
("For each string in `strings`, emit true iff it contains a given pattern.\n"
"Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
"If ignore_case is set, only simple case folding is performed."),
{"strings"}, "MatchSubstringOptions");

// User-facing documentation for the "starts with a literal pattern" check:
// a summary, a longer description, the argument names, and the name of the
// options class that carries the pattern.
const FunctionDoc starts_with_doc(
"Check if strings start with a literal pattern",
("For each string in `strings`, emit true iff it starts with a given pattern.\n"
"Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
"If ignore_case is set, only simple case folding is performed."),
{"strings"}, "MatchSubstringOptions");

// User-facing documentation for the "ends with a literal pattern" check:
// a summary, a longer description, the argument names, and the name of the
// options class that carries the pattern.
const FunctionDoc ends_with_doc(
"Check if strings end with a literal pattern",
("For each string in `strings`, emit true iff it ends with a given pattern.\n"
"Null inputs emit null. The pattern must be given in MatchSubstringOptions. "
"If ignore_case is set, only simple case folding is performed."),
{"strings"}, "MatchSubstringOptions");

#ifdef ARROW_WITH_RE2
const FunctionDoc match_substring_regex_doc(
"Match strings against regex pattern",
Expand Down Expand Up @@ -643,17 +739,20 @@ std::string MakeLikeRegex(const MatchSubstringOptions& options) {
return like_pattern;
}

// A LIKE pattern matching this regex can be translated into a substring search.
static RE2 kLikePatternIsSubstringMatch("%+([^%_]*)%+");

// Evaluate a SQL-like LIKE pattern by translating it to a regexp or
// substring search as appropriate. See what Apache Impala does:
// https://github.com/apache/impala/blob/9c38568657d62b6f6d7b10aa1c721ba843374dd8/be/src/exprs/like-predicate.cc
// Like Impala, prefix and suffix patterns are also translated into dedicated
// starts-with / ends-with searches instead of a regexp.
template <typename StringType>
struct MatchLike {
static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
// NOTE: avoid making those constants global to avoid compiling regexes at startup
// A LIKE pattern matching this regex can be translated into a substring search.
static const RE2 kLikePatternIsSubstringMatch(R"(%+([^%_]*[^\\%_])?%+)");
// A LIKE pattern matching this regex can be translated into a prefix search.
static const RE2 kLikePatternIsStartsWith(R"(([^%_]*[^\\%_])?%+)");
// A LIKE pattern matching this regex can be translated into a suffix search.
static const RE2 kLikePatternIsEndsWith(R"(%+([^%_]*))");

auto original_options = MatchSubstringState::Get(ctx);
auto original_state = ctx->state();

Expand All @@ -666,6 +765,20 @@ struct MatchLike {
MatchSubstringState converted_state(converted_options);
ctx->SetState(&converted_state);
status = MatchSubstring<StringType, PlainSubstringMatcher>::Exec(ctx, batch, out);
} else if (!original_options.ignore_case &&
re2::RE2::FullMatch(original_options.pattern, kLikePatternIsStartsWith,
&pattern)) {
MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
MatchSubstringState converted_state(converted_options);
ctx->SetState(&converted_state);
status = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec(ctx, batch, out);
} else if (!original_options.ignore_case &&
re2::RE2::FullMatch(original_options.pattern, kLikePatternIsEndsWith,
&pattern)) {
MatchSubstringOptions converted_options{pattern, original_options.ignore_case};
MatchSubstringState converted_state(converted_options);
ctx->SetState(&converted_state);
status = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec(ctx, batch, out);
} else {
MatchSubstringOptions converted_options{MakeLikeRegex(original_options),
original_options.ignore_case};
Expand Down Expand Up @@ -700,6 +813,26 @@ void AddMatchSubstring(FunctionRegistry* registry) {
func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
DCHECK_OK(registry->AddFunction(std::move(func)));
}
{
  // Register the "starts_with" scalar function for utf8 and large_utf8 inputs.
  // It is documented with starts_with_doc (not match_substring_doc) so the
  // published description says "starts with" rather than "contains".
  auto func = std::make_shared<ScalarFunction>("starts_with", Arity::Unary(),
                                               &starts_with_doc);
  auto exec_32 = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec;
  auto exec_64 = MatchSubstring<LargeStringType, PlainStartsWithMatcher>::Exec;
  DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
  DCHECK_OK(
      func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
  DCHECK_OK(registry->AddFunction(std::move(func)));
}
{
  // Register the "ends_with" scalar function for utf8 and large_utf8 inputs.
  // It is documented with ends_with_doc (not match_substring_doc) so the
  // published description says "ends with" rather than "contains".
  auto func = std::make_shared<ScalarFunction>("ends_with", Arity::Unary(),
                                               &ends_with_doc);
  auto exec_32 = MatchSubstring<StringType, PlainEndsWithMatcher>::Exec;
  auto exec_64 = MatchSubstring<LargeStringType, PlainEndsWithMatcher>::Exec;
  DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
  DCHECK_OK(
      func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
  DCHECK_OK(registry->AddFunction(std::move(func)));
}
#ifdef ARROW_WITH_RE2
{
auto func = std::make_shared<ScalarFunction>("match_substring_regex", Arity::Unary(),
Expand Down
30 changes: 30 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,30 @@ static void TrimManyAscii(benchmark::State& state) {
UnaryStringBenchmark(state, "ascii_trim", &options);
}

#ifdef ARROW_WITH_RE2
// Benchmarks "match_like" with a LIKE pattern whose wildcard sits in the middle.
static void MatchLike(benchmark::State& state) {
  MatchSubstringOptions like_options{"ab%ac"};
  UnaryStringBenchmark(state, "match_like", &like_options);
}

// MatchLike optimizes the following three into a substring/prefix/suffix search instead
// of using RE2
// Benchmarks "match_like" with a "%...%" pattern, i.e. a substring search.
static void MatchLikeSubstring(benchmark::State& state) {
  MatchSubstringOptions like_options{"%abac%"};
  UnaryStringBenchmark(state, "match_like", &like_options);
}

// Benchmarks "match_like" with a prefix pattern ("abac%"): a literal followed by
// a trailing wildcard. The wildcard must trail the literal; a leading wildcard
// would be a suffix pattern and duplicate MatchLikeSuffix.
static void MatchLikePrefix(benchmark::State& state) {
  MatchSubstringOptions options("abac%");
  UnaryStringBenchmark(state, "match_like", &options);
}

// Benchmarks "match_like" with a suffix pattern: a leading wildcard followed by
// a literal.
static void MatchLikeSuffix(benchmark::State& state) {
  MatchSubstringOptions like_options{"%abac"};
  UnaryStringBenchmark(state, "match_like", &like_options);
}
#endif

#ifdef ARROW_WITH_UTF8PROC
static void Utf8Upper(benchmark::State& state) {
UnaryStringBenchmark(state, "utf8_upper");
Expand Down Expand Up @@ -152,6 +176,12 @@ BENCHMARK(MatchSubstring);
BENCHMARK(SplitPattern);
BENCHMARK(TrimSingleAscii);
BENCHMARK(TrimManyAscii);
#ifdef ARROW_WITH_RE2
BENCHMARK(MatchLike);
BENCHMARK(MatchLikeSubstring);
BENCHMARK(MatchLikePrefix);
BENCHMARK(MatchLikeSuffix);
#endif
#ifdef ARROW_WITH_UTF8PROC
BENCHMARK(Utf8Lower);
BENCHMARK(Utf8Upper);
Expand Down
59 changes: 59 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,60 @@ TYPED_TEST(TestStringKernels, MatchSubstringIgnoreCase) {
}
#endif

// Case-sensitive "starts_with": only inputs beginning with exactly "abab" are
// true, nulls stay null, and differently-cased inputs never match.
TYPED_TEST(TestStringKernels, MatchStartsWith) {
  MatchSubstringOptions options{"abab"};
  auto same_case_inputs = R"([null, "", "ab", "abab", "$abab", "abab$"])";
  auto other_case_inputs = R"(["ABAB", "BABAB", "ABABC", "bAbAb", "aBaBc"])";

  this->CheckUnary("starts_with", "[]", boolean(), "[]", &options);
  this->CheckUnary("starts_with", same_case_inputs, boolean(),
                   "[null, false, false, true, false, true]", &options);
  this->CheckUnary("starts_with", other_case_inputs, boolean(),
                   "[false, false, false, false, false]", &options);
}

// Case-sensitive "ends_with": only inputs ending with exactly "abab" are true,
// nulls stay null, and differently-cased inputs never match.
TYPED_TEST(TestStringKernels, MatchEndsWith) {
  MatchSubstringOptions options{"abab"};
  auto same_case_inputs = R"([null, "", "ab", "abab", "$abab", "abab$"])";
  auto other_case_inputs = R"(["ABAB", "BABAB", "ABABC", "bAbAb", "aBaBc"])";

  this->CheckUnary("ends_with", "[]", boolean(), "[]", &options);
  this->CheckUnary("ends_with", same_case_inputs, boolean(),
                   "[null, false, false, true, true, false]", &options);
  this->CheckUnary("ends_with", other_case_inputs, boolean(),
                   "[false, false, false, false, false]", &options);
}

#ifdef ARROW_WITH_RE2
// Case-insensitive "starts_with" (RE2 build): the prefix "aBAb" matches
// regardless of the letter case of the input.
TYPED_TEST(TestStringKernels, MatchStartsWithIgnoreCase) {
  MatchSubstringOptions options{"aBAb", /*ignore_case=*/true};
  auto lower_inputs = R"([null, "", "ab", "abab", "$abab", "abab$"])";
  auto mixed_inputs = R"(["ABAB", "$ABAB", "ABAB$", "$AbAb", "aBaB$"])";

  this->CheckUnary("starts_with", "[]", boolean(), "[]", &options);
  this->CheckUnary("starts_with", lower_inputs, boolean(),
                   "[null, false, false, true, false, true]", &options);
  this->CheckUnary("starts_with", mixed_inputs, boolean(),
                   "[true, false, true, false, true]", &options);
}

// Case-insensitive "ends_with" (RE2 build): the suffix "aBAb" matches
// regardless of the letter case of the input.
TYPED_TEST(TestStringKernels, MatchEndsWithIgnoreCase) {
  MatchSubstringOptions options{"aBAb", /*ignore_case=*/true};
  auto lower_inputs = R"([null, "", "ab", "abab", "$abab", "abab$"])";
  auto mixed_inputs = R"(["ABAB", "$ABAB", "ABAB$", "$AbAb", "aBaB$"])";

  this->CheckUnary("ends_with", "[]", boolean(), "[]", &options);
  this->CheckUnary("ends_with", lower_inputs, boolean(),
                   "[null, false, false, true, true, false]", &options);
  this->CheckUnary("ends_with", mixed_inputs, boolean(),
                   "[true, true, false, true, false]", &options);
}
#else
// Without RE2, a case-insensitive "starts_with" must fail with NotImplemented.
TYPED_TEST(TestStringKernels, MatchStartsWithIgnoreCase) {
  MatchSubstringOptions ignore_case_options{"a", /*ignore_case=*/true};
  Datum values = ArrayFromJSON(this->type(), R"(["a"])");
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      NotImplemented, ::testing::HasSubstr("ignore_case requires RE2"),
      CallFunction("starts_with", {values}, &ignore_case_options));
}

// Without RE2, a case-insensitive "ends_with" must fail with NotImplemented.
TYPED_TEST(TestStringKernels, MatchEndsWithIgnoreCase) {
  MatchSubstringOptions ignore_case_options{"a", /*ignore_case=*/true};
  Datum values = ArrayFromJSON(this->type(), R"(["a"])");
  EXPECT_RAISES_WITH_MESSAGE_THAT(
      NotImplemented, ::testing::HasSubstr("ignore_case requires RE2"),
      CallFunction("ends_with", {values}, &ignore_case_options));
}
#endif

#ifdef ARROW_WITH_RE2
TYPED_TEST(TestStringKernels, MatchSubstringRegex) {
MatchSubstringOptions options{"ab"};
Expand Down Expand Up @@ -528,10 +582,15 @@ TYPED_TEST(TestStringKernels, MatchLike) {
TYPED_TEST(TestStringKernels, MatchLikeEscaping) {
auto inputs = R"(["%%foo", "_bar", "({", "\\baz"])";

// N.B. I believe Impala mistakenly optimizes these into substring searches
MatchSubstringOptions escape_percent{"\\%%"};
this->CheckUnary("match_like", inputs, boolean(), "[true, false, false, false]",
&escape_percent);

MatchSubstringOptions not_substring{"%\\%%"};
this->CheckUnary("match_like", inputs, boolean(), "[true, false, false, false]",
&not_substring);

MatchSubstringOptions escape_underscore{"\\____"};
this->CheckUnary("match_like", inputs, boolean(), "[false, true, false, false]",
&escape_underscore);
Expand Down
Loading