diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp index add09f845a62aa..2d45bab32b562d 100644 --- a/be/src/vec/functions/like.cpp +++ b/be/src/vec/functions/like.cpp @@ -21,7 +21,6 @@ #include #include -#include #include #include #include @@ -39,26 +38,25 @@ namespace doris::vectorized { // A regex to match any regex pattern is equivalent to a substring search. -static const RE2 SUBSTRING_RE( - "(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*"); +static const RE2 SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)"); // A regex to match any regex pattern which is equivalent to matching a constant string // at the end of the string values. -static const RE2 ENDS_WITH_RE("(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$"); +static const RE2 ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)"); // A regex to match any regex pattern which is equivalent to matching a constant string // at the end of the string values. -static const RE2 STARTS_WITH_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*"); +static const RE2 STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)"); // A regex to match any regex pattern which is equivalent to a constant string match. 
-static const RE2 EQUALS_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$"); +static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)"); // A regex to match .* -static const RE2 ALLPASS_RE("(\\\\.\\*)+"); +static const RE2 ALLPASS_RE(R"((\\.\*)+)"); // Like patterns -static const re2::RE2 LIKE_SUBSTRING_RE("(?:%+)(((\\\\_)|([^%_\\\\]))+)(?:%+)"); +static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))"); static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)"); -static const re2::RE2 LIKE_STARTS_WITH_RE("(((\\\\%)|(\\\\_)|([^%_\\\\]))+)(?:%+)"); +static const re2::RE2 LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))"); static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)"); static const re2::RE2 LIKE_ALLPASS_RE("%+"); @@ -200,7 +198,7 @@ Status FunctionLikeBase::constant_regex_fn_scalar(LikeSearchState* state, const return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); } } else { // fallback to re2 - *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex.get()); + *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex); } return Status::OK(); @@ -241,8 +239,8 @@ Status FunctionLikeBase::constant_regex_fn(LikeSearchState* state, const ColumnS } else { // fallback to re2 for (size_t i = 0; i < sz; i++) { const auto& str_ref = val.get_data_at(i); - *(result.data() + i) = RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), - *state->regex.get()); + *(result.data() + i) = + RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), *state->regex); } } @@ -447,14 +445,25 @@ void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::strin } // add ^ to pattern head to match line head - if (pattern.size() > 0 && pattern[0] != '%') { + if (!pattern.empty() && pattern[0] != '%') { re_pattern->append("^"); } bool is_escaped = false; - for (size_t i = 0; i < pattern.size(); ++i) { - if (!is_escaped) { - 
switch (pattern[i]) { + // except for % and _, all chars should keep their literal meaning. + for (char i : pattern) { + if (is_escaped) { // previous char was the escape char, so this one is escaped + if (i == '[' || i == ']' || i == '(' || i == ')' || i == '{' || i == '}' || i == '-' || + i == '*' || i == '+' || i == '\\' || i == '|' || i == '/' || i == ':' || i == '^' || + i == '.' || i == '$' || i == '?') { + re_pattern->append(1, '\\'); + } else if (i != '%' && i != '_') { + re_pattern->append(2, '\\'); + } + re_pattern->append(1, i); + is_escaped = false; + } else { + switch (i) { case '%': re_pattern->append(".*"); break; @@ -462,28 +471,23 @@ void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::strin re_pattern->append("."); break; default: - is_escaped = pattern[i] == state->escape_char; + is_escaped = i == state->escape_char; if (!is_escaped) { - re_pattern->append(1, pattern[i]); + // special for hyperscan: [, ], (, ), {, }, -, *, +, \, |, /, :, ^, ., $, ? + if (i == '[' || i == ']' || i == '(' || i == ')' || i == '{' || i == '}' || + i == '-' || i == '*' || i == '+' || i == '\\' || i == '|' || i == '/' || + i == ':' || i == '^' || i == '.' || i == '$' || i == '?') { + re_pattern->append(1, '\\'); + } + re_pattern->append(1, i); } break; } - } else { - if (pattern[i] == '.' || pattern[i] == '[' || pattern[i] == ']' || pattern[i] == '{' || - pattern[i] == '}' || pattern[i] == '(' || pattern[i] == ')' || pattern[i] == '\\' || - pattern[i] == '*' || pattern[i] == '+' || pattern[i] == '?' 
|| pattern[i] == '|' || - pattern[i] == '^' || pattern[i] == '$') { - re_pattern->append("\\"); - } else if (pattern[i] != '%' && pattern[i] != '_') { - re_pattern->append("\\\\"); - } - re_pattern->append(1, pattern[i]); - is_escaped = false; } } // add $ to pattern tail to match line tail - if (pattern.size() > 0 && re_pattern->back() != '*') { + if (!pattern.empty() && re_pattern->back() != '*') { re_pattern->append("$"); } } diff --git a/docs/sidebars.json b/docs/sidebars.json index b9c64147e9fcef..3f9e3cc67f6c6d 100644 --- a/docs/sidebars.json +++ b/docs/sidebars.json @@ -178,6 +178,7 @@ "advanced/using-hll", "advanced/variables", "advanced/time-zone", + "advanced/sql-mode", "advanced/small-file-mgr", "advanced/cold-hot-separation", "advanced/compute-node", diff --git a/regression-test/data/nereids_p0/sql_functions/conditional_functions/test_query_like.out b/regression-test/data/nereids_p0/sql_functions/conditional_functions/test_query_like.out index 5a9e10ed6cad1e..05417f338d24b2 100644 --- a/regression-test/data/nereids_p0/sql_functions/conditional_functions/test_query_like.out +++ b/regression-test/data/nereids_p0/sql_functions/conditional_functions/test_query_like.out @@ -77,3 +77,21 @@ true false -- !like24 -- false true +-- !escape1 -- +true + +-- !escape2 -- +false + +-- !escape3 -- +false + +-- !escape4 -- +true + +-- !escape5 -- +true + +-- !escape6 -- +true + diff --git a/regression-test/data/query_p0/sql_functions/conditional_functions/test_query_like.out b/regression-test/data/query_p0/sql_functions/conditional_functions/test_query_like.out index 5a9e10ed6cad1e..05417f338d24b2 100644 --- a/regression-test/data/query_p0/sql_functions/conditional_functions/test_query_like.out +++ b/regression-test/data/query_p0/sql_functions/conditional_functions/test_query_like.out @@ -77,3 +77,21 @@ true false -- !like24 -- false true +-- !escape1 -- +true + +-- !escape2 -- +false + +-- !escape3 -- +false + +-- !escape4 -- +true + +-- !escape5 -- +true + +-- 
!escape6 -- +true + diff --git a/regression-test/suites/nereids_p0/sql_functions/conditional_functions/test_query_like.groovy b/regression-test/suites/nereids_p0/sql_functions/conditional_functions/test_query_like.groovy index 5f1701778b70f0..c345ee36cf4202 100644 --- a/regression-test/suites/nereids_p0/sql_functions/conditional_functions/test_query_like.groovy +++ b/regression-test/suites/nereids_p0/sql_functions/conditional_functions/test_query_like.groovy @@ -47,4 +47,11 @@ suite("test_query_like", "query,p0") { qt_like22 """select "abcd%%1" like "abcd__1", "abcd%%1" not like "abcd__1" """ qt_like23 """select "abcd%%1" like "abcd_%_", "abcd%%1" not like "abcd_%_" """ qt_like24 """select "abcd%%1" like "abcd\\_%1", "abcd%%1" not like "abcd\\_%1" """ + + qt_escape1 """select 'facebook_10008_T1+T2-ALL_AAA-VO_LowestCost_20230830_HSJ' LIKE '%facebook_10008_T1+T2%' """ + qt_escape2 """select '!z23]' like '_[z]%' """ + qt_escape3 """select '[123]' like '%[1.*]%' """ + qt_escape4 """select '1\\b\\b' like '%_\\b\\b%' """ + qt_escape5 """select '1\\d\\d' like '%_\\d\\d%' """ + qt_escape6 """select '1dd' like '%_\\d\\d%' """ } diff --git a/regression-test/suites/query_p0/sql_functions/conditional_functions/test_query_like.groovy b/regression-test/suites/query_p0/sql_functions/conditional_functions/test_query_like.groovy index b15a5383d79fa0..9ebb300ee5ec6a 100644 --- a/regression-test/suites/query_p0/sql_functions/conditional_functions/test_query_like.groovy +++ b/regression-test/suites/query_p0/sql_functions/conditional_functions/test_query_like.groovy @@ -45,4 +45,11 @@ suite("test_query_like", "query,p0") { qt_like22 """select "abcd%%1" like "abcd__1", "abcd%%1" not like "abcd__1" """ qt_like23 """select "abcd%%1" like "abcd_%_", "abcd%%1" not like "abcd_%_" """ qt_like24 """select "abcd%%1" like "abcd\\_%1", "abcd%%1" not like "abcd\\_%1" """ + + qt_escape1 """select 'facebook_10008_T1+T2-ALL_AAA-VO_LowestCost_20230830_HSJ' LIKE '%facebook_10008_T1+T2%' """ + 
qt_escape2 """select '!z23]' like '_[z]%' """ + qt_escape3 """select '[123]' like '%[1.*]%' """ + qt_escape4 """select '1\\b\\b' like '%_\\b\\b%' """ + qt_escape5 """select '1\\d\\d' like '%_\\d\\d%' """ + qt_escape6 """select '1dd' like '%_\\d\\d%' """ }