From 4de32d0628fb5b2304dd8131fd7a35f62f2501e2 Mon Sep 17 00:00:00 2001
From: zhaochangle <zhaochangle@selectdb.com>
Date: Thu, 15 Aug 2024 03:06:04 +0800
Subject: [PATCH 1/2] Support unicode column names in delete conditions (boost::regex -> RE2)

---
 be/src/olap/delete_handler.cpp                | 45 +++++++++----------
 .../data/delete_p0/test_delete_unicode.out    |  6 +++
 .../delete_p0/test_delete_unicode.groovy      | 39 ++++++++++++++++
 3 files changed, 66 insertions(+), 24 deletions(-)
 create mode 100644 regression-test/data/delete_p0/test_delete_unicode.out
 create mode 100644 regression-test/suites/delete_p0/test_delete_unicode.groovy

diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp
index 6819d7d90f3ef7..09e23d515445cb 100644
--- a/be/src/olap/delete_handler.cpp
+++ b/be/src/olap/delete_handler.cpp
@@ -312,38 +312,35 @@ Status DeleteHandler::parse_condition(const DeleteSubPredicatePB& sub_cond, TCon
 // value: matches "1597751948193618247  and length(source)<1;\n;\n"
 //
 // For more info, see DeleteHandler::construct_sub_predicates
-// FIXME(gavin): support unicode. And this is a tricky implementation, it should
-//               not be the final resolution, refactor it.
+// FIXME(gavin): This is a tricky implementation, it should not be the final resolution, refactor it.
 const char* const CONDITION_STR_PATTERN =
-    // .----------------- column-name ----------------.   .----------------------- operator ------------------------.   .------------ value ----------.
-    R"(([_a-zA-Z@0-9\s/][.a-zA-Z0-9_+-/?@#$%^&*"\s,:]*)\s*((?:=)|(?:!=)|(?:>>)|(?:<<)|(?:>=)|(?:<=)|(?:\*=)|(?: IS ))\s*('((?:[\s\S]+)?)'|(?:[\s\S]+)?))";
-    // '----------------- group 1 --------------------'   '--------------------- group 2 ---------------------------'   | '-- group 4--'              |
-    //                                                         match any of: = != >> << >= <= *= " IS "                 '----------- group 3 ---------'
-    //                                                                                                                   match **ANY THING** without(4)
-    //                                                                                                                   or with(3) single quote
-boost::regex DELETE_HANDLER_REGEX(CONDITION_STR_PATTERN);
+    // .----------------- column-name --------------------------.   .----------------------- operator ------------------------.   .------------ value ----------.
+    R"(([_a-zA-Z@0-9\s/\p{L}][.a-zA-Z0-9_+-/?@#$%^&*"\s,:\p{L}]*)\s*((?:=)|(?:!=)|(?:>>)|(?:<<)|(?:>=)|(?:<=)|(?:\*=)|(?: IS ))\s*('((?:[\s\S]+)?)'|(?:[\s\S]+)?))";
+    // '----------------- group 1 ------------------------------'   '--------------------- group 2 ---------------------------'   | '-- group 4--'              |
+    //                                                                   match any of: = != >> << >= <= *= " IS "                 '----------- group 3 ---------'
+    //                                                                                                                             match **ANY THING** without(4)
+    //                                                                                                                             or with(3) single quote
 // clang-format on
+RE2 DELETE_HANDLER_REGEX(CONDITION_STR_PATTERN);
 
 Status DeleteHandler::parse_condition(const std::string& condition_str, TCondition* condition) {
-    bool matched = false;
-    boost::smatch what;
-    try {
-        VLOG_NOTICE << "condition_str: " << condition_str;
-        matched = boost::regex_match(condition_str, what, DELETE_HANDLER_REGEX) &&
-                  condition_str.size() == what[0].str().size(); // exact match
-    } catch (boost::regex_error& e) {
-        VLOG_NOTICE << "fail to parse expr. [expr=" << condition_str << "; error=" << e.what()
-                    << "]";
-    }
+    std::string col_name, op, value, g4;
+
+    bool matched = RE2::FullMatch(condition_str, DELETE_HANDLER_REGEX, &col_name, &op, &value,
+                                  &g4); // exact match
+
     if (!matched) {
-        return Status::Error<ErrorCode::INVALID_ARGUMENT>("fail to sub condition. condition={}",
-                                                          condition_str);
+        return Status::InvalidArgument("fail to sub condition. condition={}", condition_str);
     }
 
-    condition->column_name = what[1].str();
-    condition->condition_op = what[2].str() == " IS " ? "IS" : what[2].str();
+    condition->column_name = col_name;
+    condition->condition_op = op == " IS " ? "IS" : op;
     // match string with single quotes, a = b  or a = 'b'
-    condition->condition_values.push_back(what[3 + !!what[4].matched].str());
+    // Group 4 participates exactly when group 3 is wrapped in single quotes. RE2 hands back
+    // "" for both an unmatched group and an empty capture, so testing g4.empty() would parse
+    // a='' as the literal two-character value ''. Decide from the quotes instead.
+    const bool quoted = value.size() >= 2 && value.front() == '\'' && value.back() == '\'';
+    condition->condition_values.push_back(quoted ? g4 : value);
     VLOG_NOTICE << "parsed condition_str: col_name={" << condition->column_name << "} op={"
                 << condition->condition_op << "} val={" << condition->condition_values.back()
                 << "}";
diff --git a/regression-test/data/delete_p0/test_delete_unicode.out b/regression-test/data/delete_p0/test_delete_unicode.out
new file mode 100644
index 00000000000000..c0cb04a2a1dde2
--- /dev/null
+++ b/regression-test/data/delete_p0/test_delete_unicode.out
@@ -0,0 +1,6 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql1 --
+2020-12-12	1	1	1
+
+-- !sql2 --
+
diff --git a/regression-test/suites/delete_p0/test_delete_unicode.groovy b/regression-test/suites/delete_p0/test_delete_unicode.groovy
new file mode 100644
index 00000000000000..9dd5f589a07dae
--- /dev/null
+++ b/regression-test/suites/delete_p0/test_delete_unicode.groovy
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_delete_unicode") {
+    sql "set enable_unicode_name_support=true;"
+    sql "DROP TABLE IF EXISTS table_7298276;" // a leftover table from an earlier run would fail CREATE
+    sql """
+        CREATE TABLE `table_7298276` (
+        `中文列名1` date NOT NULL,
+        `中文列名2` int NOT NULL,
+        `中文列名3` bigint NOT NULL,
+        `中文列名4` largeint NOT NULL,
+        INDEX 中文列名2 (`中文列名2`) USING INVERTED,
+        INDEX 中文列名4 (`中文列名4`) USING INVERTED
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`中文列名1`, `中文列名2`, `中文列名3`)
+        DISTRIBUTED BY HASH(`中文列名1`, `中文列名2`, `中文列名3`) BUCKETS 4
+        properties("replication_num" = "1");
+    """
+    // One row in, then delete it via predicates on the unicode-named columns; sql2 must be empty.
+    sql """ insert into table_7298276 values ('2020-12-12',1,1,1);"""
+    qt_sql1 "select * from table_7298276;"
+    sql "delete from table_7298276 where 中文列名1 > '2012-08-17' and 中文列名2 > -68 and 中文列名3 in (1,2,3);"
+    qt_sql2 "select * from table_7298276;"
+}
\ No newline at end of file

From 53ce3e3e3f7abec9d61d8df0f44d048a9af05df2 Mon Sep 17 00:00:00 2001
From: zhaochangle <zhaochangle@selectdb.com>
Date: Thu, 15 Aug 2024 16:33:33 +0800
Subject: [PATCH 2/2] Drop unused includes/usings in delete_handler.cpp; add unicode parse_condition unit tests

---
 be/src/olap/delete_handler.cpp       | 4 ----
 be/test/olap/delete_handler_test.cpp | 4 ++++
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp
index 09e23d515445cb..b2692769906aae 100644
--- a/be/src/olap/delete_handler.cpp
+++ b/be/src/olap/delete_handler.cpp
@@ -21,8 +21,6 @@
 #include <gen_cpp/olap_file.pb.h>
 #include <thrift/protocol/TDebugProtocol.h>
 
-#include <boost/regex.hpp>
-#include <sstream>
 #include <string>
 #include <vector>
 
@@ -40,12 +38,10 @@
 using apache::thrift::ThriftDebugString;
 using std::vector;
 using std::string;
-using std::stringstream;
 
 using ::google::protobuf::RepeatedPtrField;
 
 namespace doris {
-using namespace ErrorCode;
 
 // construct sub condition from TCondition
 std::string construct_sub_predicate(const TCondition& condition) {
diff --git a/be/test/olap/delete_handler_test.cpp b/be/test/olap/delete_handler_test.cpp
index 335c163930df3e..0d45d28c2846b5 100644
--- a/be/test/olap/delete_handler_test.cpp
+++ b/be/test/olap/delete_handler_test.cpp
@@ -1225,6 +1225,10 @@ TEST_F(TestDeleteHandler, TestParseDeleteCondition) {
         {R"(a IS b IS NOT NULL)", true,  gen_cond(R"(a IS b)", "IS", R"(NOT NULL)"  )}, // test " IS " in column name
         {R"(_a-zA-Z@0-9 /.a-zA-Z0-9_+-/?@#$%^&*" ,:=hell)", true, gen_cond(R"(_a-zA-Z@0-9 /.a-zA-Z0-9_+-/?@#$%^&*" ,:)", "=", R"(hell)")}, // hellbound column name
         {R"(this is a col very loooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooon colum name=long)", true,  gen_cond(R"(this is a col very loooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooon colum name)", "=", R"(long)")}, // test " IS " in column name
+        {R"(中文列名1=b)"        , true,  gen_cond(R"(中文列名1)", "=" , R"(b)"        )}, // Chinese case
+        {R"(错!!误!=b)"         , false,  gen_cond(R"(abc)"   , "!=", R"(b)"         )}, // illegal character
+        {R"(##错误<=b)"         , false,  gen_cond(R"(abc)"   , "<=", R"(b)"         )}, // illegal prefix
+        {R"(κάνεις지내세요>>b)"   , true,  gen_cond(R"(κάνεις지내세요)", ">>", R"(b)"    )}, // other languages
     };
     for (auto& i : test_input) { test(i); }
 }