From 4de32d0628fb5b2304dd8131fd7a35f62f2501e2 Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Thu, 15 Aug 2024 03:06:04 +0800 Subject: [PATCH 1/2] 1 --- be/src/olap/delete_handler.cpp | 45 +++++++++---------- .../data/delete_p0/test_delete_unicode.out | 6 +++ .../delete_p0/test_delete_unicode.groovy | 39 ++++++++++++++++ 3 files changed, 66 insertions(+), 24 deletions(-) create mode 100644 regression-test/data/delete_p0/test_delete_unicode.out create mode 100644 regression-test/suites/delete_p0/test_delete_unicode.groovy diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp index 6819d7d90f3ef7..09e23d515445cb 100644 --- a/be/src/olap/delete_handler.cpp +++ b/be/src/olap/delete_handler.cpp @@ -312,38 +312,35 @@ Status DeleteHandler::parse_condition(const DeleteSubPredicatePB& sub_cond, TCon // value: matches "1597751948193618247 and length(source)<1;\n;\n" // // For more info, see DeleteHandler::construct_sub_predicates -// FIXME(gavin): support unicode. And this is a tricky implementation, it should -// not be the final resolution, refactor it. +// FIXME(gavin): This is a tricky implementation, it should not be the final resolution, refactor it. const char* const CONDITION_STR_PATTERN = - // .----------------- column-name ----------------. .----------------------- operator ------------------------. .------------ value ----------. - R"(([_a-zA-Z@0-9\s/][.a-zA-Z0-9_+-/?@#$%^&*"\s,:]*)\s*((?:=)|(?:!=)|(?:>>)|(?:<<)|(?:>=)|(?:<=)|(?:\*=)|(?: IS ))\s*('((?:[\s\S]+)?)'|(?:[\s\S]+)?))"; - // '----------------- group 1 --------------------' '--------------------- group 2 ---------------------------' | '-- group 4--' | - // match any of: = != >> << >= <= *= " IS " '----------- group 3 ---------' - // match **ANY THING** without(4) - // or with(3) single quote -boost::regex DELETE_HANDLER_REGEX(CONDITION_STR_PATTERN); + // .----------------- column-name --------------------------. .----------------------- operator ------------------------. .------------ value ----------. + R"(([_a-zA-Z@0-9\s/\p{L}][.a-zA-Z0-9_+-/?@#$%^&*"\s,:\p{L}]*)\s*((?:=)|(?:!=)|(?:>>)|(?:<<)|(?:>=)|(?:<=)|(?:\*=)|(?: IS ))\s*('((?:[\s\S]+)?)'|(?:[\s\S]+)?))"; + // '----------------- group 1 ------------------------------' '--------------------- group 2 ---------------------------' | '-- group 4--' | + // match any of: = != >> << >= <= *= " IS " '----------- group 3 ---------' + // match **ANY THING** without(4) + // or with(3) single quote // clang-format on +RE2 DELETE_HANDLER_REGEX(CONDITION_STR_PATTERN); Status DeleteHandler::parse_condition(const std::string& condition_str, TCondition* condition) { - bool matched = false; - boost::smatch what; - try { - VLOG_NOTICE << "condition_str: " << condition_str; - matched = boost::regex_match(condition_str, what, DELETE_HANDLER_REGEX) && - condition_str.size() == what[0].str().size(); // exact match - } catch (boost::regex_error& e) { - VLOG_NOTICE << "fail to parse expr. [expr=" << condition_str << "; error=" << e.what() - << "]"; - } + std::string col_name, op, value, g4; + + bool matched = RE2::FullMatch(condition_str, DELETE_HANDLER_REGEX, &col_name, &op, &value, + &g4); // exact match + if (!matched) { - return Status::Error("fail to sub condition. condition={}", - condition_str); + return Status::InvalidArgument("fail to sub condition. condition={}", condition_str); } - condition->column_name = what[1].str(); - condition->condition_op = what[2].str() == " IS " ? "IS" : what[2].str(); + condition->column_name = col_name; + condition->condition_op = op == " IS " ? "IS" : op; // match string with single quotes, a = b or a = 'b' - condition->condition_values.push_back(what[3 + !!what[4].matched].str()); + if (!g4.empty()) { + condition->condition_values.push_back(g4); + } else { + condition->condition_values.push_back(value); + } VLOG_NOTICE << "parsed condition_str: col_name={" << condition->column_name << "} op={" << condition->condition_op << "} val={" << condition->condition_values.back() << "}"; diff --git a/regression-test/data/delete_p0/test_delete_unicode.out b/regression-test/data/delete_p0/test_delete_unicode.out new file mode 100644 index 00000000000000..c0cb04a2a1dde2 --- /dev/null +++ b/regression-test/data/delete_p0/test_delete_unicode.out @@ -0,0 +1,6 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql1 -- +2020-12-12 1 1 1 + +-- !sql2 -- + diff --git a/regression-test/suites/delete_p0/test_delete_unicode.groovy b/regression-test/suites/delete_p0/test_delete_unicode.groovy new file mode 100644 index 00000000000000..9dd5f589a07dae --- /dev/null +++ b/regression-test/suites/delete_p0/test_delete_unicode.groovy @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_delete_unicode") { + sql "set enable_unicode_name_support=true;" + + sql """ + CREATE TABLE `table_7298276` ( + `中文列名1` date NOT NULL, + `中文列名2` int NOT NULL, + `中文列名3` bigint NOT NULL, + `中文列名4` largeint NOT NULL, + INDEX 中文列名2 (`中文列名2`) USING INVERTED, + INDEX 中文列名4 (`中文列名4`) USING INVERTED + ) ENGINE=OLAP + DUPLICATE KEY(`中文列名1`, `中文列名2`, `中文列名3`) + DISTRIBUTED BY HASH(`中文列名1`, `中文列名2`, `中文列名3`) BUCKETS 4 + properties("replication_num" = "1"); + """ + + sql """ insert into table_7298276 values ('2020-12-12',1,1,1);""" + qt_sql1 "select * from table_7298276;" + sql "delete from table_7298276 where 中文列名1 > '2012-08-17' and 中文列名2 > -68 and 中文列名3 in (1,2,3);" + qt_sql2 "select * from table_7298276;" +} \ No newline at end of file From 53ce3e3e3f7abec9d61d8df0f44d048a9af05df2 Mon Sep 17 00:00:00 2001 From: zhaochangle Date: Thu, 15 Aug 2024 16:33:33 +0800 Subject: [PATCH 2/2] beut --- be/src/olap/delete_handler.cpp | 4 ---- be/test/olap/delete_handler_test.cpp | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp index 09e23d515445cb..b2692769906aae 100644 --- a/be/src/olap/delete_handler.cpp +++ b/be/src/olap/delete_handler.cpp @@ -21,8 +21,6 @@ #include #include -#include -#include #include #include @@ -40,12 +38,10 @@ using apache::thrift::ThriftDebugString; using std::vector; using std::string; -using std::stringstream; using ::google::protobuf::RepeatedPtrField; namespace doris { -using namespace ErrorCode; // construct sub condition from TCondition std::string construct_sub_predicate(const TCondition& condition) { diff --git a/be/test/olap/delete_handler_test.cpp b/be/test/olap/delete_handler_test.cpp index 335c163930df3e..0d45d28c2846b5 100644 --- a/be/test/olap/delete_handler_test.cpp +++ b/be/test/olap/delete_handler_test.cpp @@ -1225,6 +1225,10 @@ TEST_F(TestDeleteHandler, TestParseDeleteCondition) { {R"(a IS b IS NOT NULL)", true, gen_cond(R"(a IS b)", "IS", R"(NOT NULL)" )}, // test " IS " in column name {R"(_a-zA-Z@0-9 /.a-zA-Z0-9_+-/?@#$%^&*" ,:=hell)", true, gen_cond(R"(_a-zA-Z@0-9 /.a-zA-Z0-9_+-/?@#$%^&*" ,:)", "=", R"(hell)")}, // hellbound column name {R"(this is a col very loooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooon colum name=long)", true, gen_cond(R"(this is a col very loooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooon colum name)", "=", R"(long)")}, // test " IS " in column name + {R"(中文列名1=b)" , true, gen_cond(R"(中文列名1)", "=" , R"(b)" )}, // Chinese case + {R"(错!!误!=b)" , false, gen_cond(R"(abc)" , "!=", R"(b)" )}, // illegal character + {R"(##错误<=b)" , false, gen_cond(R"(abc)" , "<=", R"(b)" )}, // illegal prefix + {R"(κάνεις지내세요>>b)" , true, gen_cond(R"(κάνεις지내세요)", ">>", R"(b)" )}, // other languages }; for (auto& i : test_input) { test(i); } }