Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 34 additions & 30 deletions be/src/vec/functions/like.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
#include <hs/hs_compile.h>
#include <re2/stringpiece.h>

#include <algorithm>
#include <cstddef>
#include <ostream>
#include <utility>
Expand All @@ -39,26 +38,25 @@

namespace doris::vectorized {
// A regex to match any regex pattern which is equivalent to a substring search.
static const RE2 SUBSTRING_RE(
"(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*");
static const RE2 SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");

// A regex to match any regex pattern which is equivalent to matching a constant string
// at the end of the string values.
static const RE2 ENDS_WITH_RE("(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$");
static const RE2 ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");

// A regex to match any regex pattern which is equivalent to matching a constant string
// at the start of the string values.
static const RE2 STARTS_WITH_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*");
static const RE2 STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");

// A regex to match any regex pattern which is equivalent to a constant string match.
static const RE2 EQUALS_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$");
static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
// A regex to match .*
static const RE2 ALLPASS_RE("(\\\\.\\*)+");
static const RE2 ALLPASS_RE(R"((\\.\*)+)");

// Like patterns
static const re2::RE2 LIKE_SUBSTRING_RE("(?:%+)(((\\\\_)|([^%_\\\\]))+)(?:%+)");
static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))");
static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)");
static const re2::RE2 LIKE_STARTS_WITH_RE("(((\\\\%)|(\\\\_)|([^%_\\\\]))+)(?:%+)");
static const re2::RE2 LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))");
static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)");
static const re2::RE2 LIKE_ALLPASS_RE("%+");

Expand Down Expand Up @@ -200,7 +198,7 @@ Status FunctionLikeBase::constant_regex_fn_scalar(LikeSearchState* state, const
return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
}
} else { // fallback to re2
*result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex.get());
*result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex);
}

return Status::OK();
Expand Down Expand Up @@ -241,8 +239,8 @@ Status FunctionLikeBase::constant_regex_fn(LikeSearchState* state, const ColumnS
} else { // fallback to re2
for (size_t i = 0; i < sz; i++) {
const auto& str_ref = val.get_data_at(i);
*(result.data() + i) = RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size),
*state->regex.get());
*(result.data() + i) =
RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), *state->regex);
}
}

Expand Down Expand Up @@ -447,43 +445,49 @@ void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::strin
}

// add ^ to pattern head to match line head
if (pattern.size() > 0 && pattern[0] != '%') {
if (!pattern.empty() && pattern[0] != '%') {
re_pattern->append("^");
}

bool is_escaped = false;
for (size_t i = 0; i < pattern.size(); ++i) {
if (!is_escaped) {
switch (pattern[i]) {
// except for % and _, all chars should keep their literal meaning.
for (char i : pattern) {
if (is_escaped) { // last is \, this should be escape
if (i == '[' || i == ']' || i == '(' || i == ')' || i == '{' || i == '}' || i == '-' ||
i == '*' || i == '+' || i == '\\' || i == '|' || i == '/' || i == ':' || i == '^' ||
i == '.' || i == '$' || i == '?') {
re_pattern->append(1, '\\');
} else if (i != '%' && i != '_') {
re_pattern->append(2, '\\');
}
re_pattern->append(1, i);
is_escaped = false;
} else {
switch (i) {
case '%':
re_pattern->append(".*");
break;
case '_':
re_pattern->append(".");
break;
default:
is_escaped = pattern[i] == state->escape_char;
is_escaped = i == state->escape_char;
if (!is_escaped) {
re_pattern->append(1, pattern[i]);
// special for hyperscan: [, ], (, ), {, }, -, *, +, \, |, /, :, ^, ., $, ?
if (i == '[' || i == ']' || i == '(' || i == ')' || i == '{' || i == '}' ||
i == '-' || i == '*' || i == '+' || i == '\\' || i == '|' || i == '/' ||
i == ':' || i == '^' || i == '.' || i == '$' || i == '?') {
re_pattern->append(1, '\\');
}
re_pattern->append(1, i);
}
break;
}
} else {
if (pattern[i] == '.' || pattern[i] == '[' || pattern[i] == ']' || pattern[i] == '{' ||
pattern[i] == '}' || pattern[i] == '(' || pattern[i] == ')' || pattern[i] == '\\' ||
pattern[i] == '*' || pattern[i] == '+' || pattern[i] == '?' || pattern[i] == '|' ||
pattern[i] == '^' || pattern[i] == '$') {
re_pattern->append("\\");
} else if (pattern[i] != '%' && pattern[i] != '_') {
re_pattern->append("\\\\");
}
re_pattern->append(1, pattern[i]);
is_escaped = false;
}
}

// add $ to pattern tail to match line tail
if (pattern.size() > 0 && re_pattern->back() != '*') {
if (!pattern.empty() && re_pattern->back() != '*') {
re_pattern->append("$");
}
}
Expand Down
1 change: 1 addition & 0 deletions docs/sidebars.json
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@
"advanced/using-hll",
"advanced/variables",
"advanced/time-zone",
"advanced/sql-mode",
"advanced/small-file-mgr",
"advanced/cold-hot-separation",
"advanced/compute-node",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,21 @@ true false
-- !like24 --
false true

-- !escape1 --
true

-- !escape2 --
false

-- !escape3 --
false

-- !escape4 --
true

-- !escape5 --
true

-- !escape6 --
true

Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,21 @@ true false
-- !like24 --
false true

-- !escape1 --
true

-- !escape2 --
false

-- !escape3 --
false

-- !escape4 --
true

-- !escape5 --
true

-- !escape6 --
true

Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,11 @@ suite("test_query_like", "query,p0") {
qt_like22 """select "abcd%%1" like "abcd__1", "abcd%%1" not like "abcd__1" """
qt_like23 """select "abcd%%1" like "abcd_%_", "abcd%%1" not like "abcd_%_" """
qt_like24 """select "abcd%%1" like "abcd\\_%1", "abcd%%1" not like "abcd\\_%1" """

qt_escape1 """select 'facebook_10008_T1+T2-ALL_AAA-VO_LowestCost_20230830_HSJ' LIKE '%facebook_10008_T1+T2%' """
qt_escape2 """select '!z23]' like '_[z]%' """
qt_escape3 """select '[123]' like '%[1.*]%' """
qt_escape4 """select '1\\b\\b' like '%_\\b\\b%' """
qt_escape5 """select '1\\d\\d' like '%_\\d\\d%' """
qt_escape6 """select '1dd' like '%_\\d\\d%' """
}
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,11 @@ suite("test_query_like", "query,p0") {
qt_like22 """select "abcd%%1" like "abcd__1", "abcd%%1" not like "abcd__1" """
qt_like23 """select "abcd%%1" like "abcd_%_", "abcd%%1" not like "abcd_%_" """
qt_like24 """select "abcd%%1" like "abcd\\_%1", "abcd%%1" not like "abcd\\_%1" """

qt_escape1 """select 'facebook_10008_T1+T2-ALL_AAA-VO_LowestCost_20230830_HSJ' LIKE '%facebook_10008_T1+T2%' """
qt_escape2 """select '!z23]' like '_[z]%' """
qt_escape3 """select '[123]' like '%[1.*]%' """
qt_escape4 """select '1\\b\\b' like '%_\\b\\b%' """
qt_escape5 """select '1\\d\\d' like '%_\\d\\d%' """
qt_escape6 """select '1dd' like '%_\\d\\d%' """
}