From 807bea10dace1a13e43f99b44f3d429b825ba613 Mon Sep 17 00:00:00 2001 From: felixwluo Date: Thu, 17 Apr 2025 21:31:24 +0800 Subject: [PATCH 1/6] [feat](function) SUBSTRING_INDEX function delimiter supports dynamic --- be/src/vec/functions/function_string.h | 63 ++++++++++--------- .../functions/scalar/SubstringIndex.java | 11 ---- .../string_functions/test_string_function.out | 8 +++ .../test_string_function.groovy | 25 ++++++++ 4 files changed, 68 insertions(+), 39 deletions(-) diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 905c906db8e294..389af483206bc1 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -1804,24 +1804,34 @@ class FunctionSubstringIndex : public IFunction { const auto* str_col = assert_cast(content_column.get()); - [[maybe_unused]] const auto& [delimiter_col, delimiter_const] = + // Handle both constant and non-constant delimiter parameters + ColumnPtr delimiter_column_ptr; + bool delimiter_const = false; + std::tie(delimiter_column_ptr, delimiter_const) = unpack_if_const(block.get_by_position(arguments[1]).column); - auto delimiter = delimiter_col->get_data_at(0); - int32_t delimiter_size = delimiter.size; + const auto* delimiter_col = assert_cast(delimiter_column_ptr.get()); - [[maybe_unused]] const auto& [part_num_col, part_const] = + ColumnPtr part_num_column_ptr; + bool part_num_const = false; + std::tie(part_num_column_ptr, part_num_const) = unpack_if_const(block.get_by_position(arguments[2]).column); - auto part_number = *((int*)part_num_col->get_data_at(0).data); + const auto* part_num_col = part_num_column_ptr.get(); - if (part_number == 0 || delimiter_size == 0) { - for (size_t i = 0; i < input_rows_count; ++i) { + for (size_t i = 0; i < input_rows_count; ++i) { + auto str = str_col->get_data_at(i); + auto delimiter = delimiter_col->get_data_at(delimiter_const ? 0 : i); + int32_t delimiter_size = delimiter.size; + + const auto* part_num_data = part_num_col->get_data_at(part_num_const ? 0 : i).data; + auto part_number = *reinterpret_cast(part_num_data); + + if (part_number == 0 || delimiter_size == 0) { StringOP::push_empty_string(i, res_chars, res_offsets); + continue; } - } else if (part_number > 0) { - if (delimiter_size == 1) { - // If delimiter is a char, use memchr to split - for (size_t i = 0; i < input_rows_count; ++i) { - auto str = str_col->get_data_at(i); + + if (part_number > 0) { + if (delimiter_size == 1) { int32_t offset = -1; int32_t num = 0; while (num < part_number) { @@ -1847,12 +1857,10 @@ class FunctionSubstringIndex : public IFunction { StringOP::push_value_string(std::string_view(str.data, str.size), i, res_chars, res_offsets); } - } - } else { - StringRef delimiter_ref(delimiter); - StringSearch search(&delimiter_ref); - for (size_t i = 0; i < input_rows_count; ++i) { - auto str = str_col->get_data_at(i); + } else { + // For multi-character delimiters + StringRef delimiter_ref(delimiter); + StringSearch search(&delimiter_ref); int32_t offset = -delimiter_size; int32_t num = 0; while (num < part_number) { @@ -1879,21 +1887,20 @@ class FunctionSubstringIndex : public IFunction { res_chars, res_offsets); } } - } - } else { - // if part_number is negative - part_number = -part_number; - for (size_t i = 0; i < input_rows_count; ++i) { - auto str = str_col->get_data_at(i); + } else { + int neg_part_number = -part_number; auto str_str = str.to_string(); int32_t offset = str.size; int32_t pre_offset = offset; int32_t num = 0; auto substr = str_str; - while (num <= part_number && offset >= 0) { - offset = (int)substr.rfind(delimiter, offset); + + std::string delimiter_str(reinterpret_cast(delimiter.data), + delimiter.size); + while (num <= neg_part_number && offset >= 0) { + offset = (int)substr.rfind(delimiter_str, offset); if (offset != -1) { - if (++num == part_number) { + if (++num == neg_part_number) { break; } pre_offset = offset; @@ -1905,7 +1912,7 @@ class FunctionSubstringIndex : public IFunction { } num = (offset == -1 && num != 0) ? num + 1 : num; - if (num == part_number) { + if (num == neg_part_number) { if (offset == -1) { StringOP::push_value_string(std::string_view(str.data, str.size), i, res_chars, res_offsets); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SubstringIndex.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SubstringIndex.java index bb9e2b749c410b..7751578b2a472e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SubstringIndex.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SubstringIndex.java @@ -18,7 +18,6 @@ package org.apache.doris.nereids.trees.expressions.functions.scalar; import org.apache.doris.catalog.FunctionSignature; -import org.apache.doris.nereids.exceptions.AnalysisException; import org.apache.doris.nereids.trees.expressions.Expression; import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; @@ -53,16 +52,6 @@ public SubstringIndex(Expression arg0, Expression arg1, Expression arg2) { super("substring_index", arg0, arg1, arg2); } - @Override - public void checkLegalityBeforeTypeCoercion() { - for (int i = 1; i < children.size(); ++i) { - if (!getArgument(i).isConstant()) { - throw new AnalysisException(getName() - + " function except for the first argument, other parameter must be a constant."); - } - } - } - /** * withChildren. */ diff --git a/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out b/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out index d85794989f7de0..f42c4292ad4071 100644 --- a/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out +++ b/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out @@ -380,6 +380,14 @@ doris -- !sql -- \N +-- !sql -- +1 BBB AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 _02|CCC_03|DDD_04|EEE_05|FFF_06 _02|CCC_03|DDD_04|EEE_05|FFF_06 +2 ccc zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06 zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06 zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06 +3 DDD AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 _04|EEE_05|FFF_06 _04|EEE_05|FFF_06 +4 DDD sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06 _04|rfv_05|rgb_06 _04|rfv_05|rgb_06 +5 eee cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06 cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06 cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06 +6 A_01 AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06 |dsd_02|ert_03|bgt_04|fgh_05|hyb_06 |dsd_02|ert_03|bgt_04|fgh_05|hyb_06 + -- !sql -- tNEW-STRorigin str diff --git a/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy b/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy index 6e9cd947bc2ed5..99d8e9dfddef56 100644 --- a/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy +++ b/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy @@ -188,6 +188,31 @@ suite("test_string_function") { qt_sql "select elt(1, \"hello\", \"doris\");" qt_sql "select elt(2, \"hello\", \"doris\");" qt_sql "select elt(3, \"hello\", \"doris\");" + qt_sql """ + SELECT t1.no + ,t1.sub_str + ,t1.str + ,substring_index(t1.str, t1.sub_str, -1) + ,t2.rst2 + FROM ( + SELECT 1 AS no, 'BBB' AS sub_str, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06' AS str UNION ALL + SELECT 2 AS no, 'ccc' AS sub_str, 'zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06' AS str UNION ALL + SELECT 3 AS no, 'DDD' AS sub_str, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06' AS str UNION ALL + SELECT 4 AS no, 'DDD' AS sub_str, 'sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06' AS str UNION ALL + SELECT 5 AS no, 'eee' AS sub_str, 'cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06' AS str UNION ALL + SELECT 6 AS no, 'A_01' AS sub_str, 'AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06' AS str + ) t1 + LEFT JOIN ( + SELECT 1 AS no, 'BBB' AS sub_str, substring_index('AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', 'BBB', -1) AS rst2 UNION ALL + SELECT 2 AS no, 'ccc' AS sub_str, substring_index('zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06', 'ccc', -1) AS rst2 UNION ALL + SELECT 3 AS no, 'DDD' AS sub_str, substring_index('AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', 'DDD', -1) AS rst2 UNION ALL + SELECT 4 AS no, 'DDD' AS sub_str, substring_index('sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06', 'DDD', -1) AS rst2 UNION ALL + SELECT 5 AS no, 'eee' AS sub_str, substring_index('cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06', 'eee', -1) AS rst2 UNION ALL + SELECT 6 AS no, 'A_01' AS sub_str, substring_index('AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06', 'A_01', -1) AS rst2 + ) t2 + ON t1.no = t2.no AND t1.sub_str = t2.sub_str + ORDER BY t1.no; + """ qt_sql "select sub_replace(\"this is origin str\",\"NEW-STR\",1);" qt_sql "select sub_replace(\"doris\",\"***\",1,2);" From eab6f8b924ed2b72a087e8c87361329cb3358dc3 Mon Sep 17 00:00:00 2001 From: felixwluo Date: Fri, 18 Apr 2025 23:31:49 +0800 Subject: [PATCH 2/6] save --- be/src/vec/functions/function_string.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 389af483206bc1..315bc15ef98f85 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -1895,8 +1895,8 @@ class FunctionSubstringIndex : public IFunction { int32_t num = 0; auto substr = str_str; - std::string delimiter_str(reinterpret_cast(delimiter.data), - delimiter.size); + StringRef delimiter_str(reinterpret_cast(delimiter.data), + delimiter.size); while (num <= neg_part_number && offset >= 0) { offset = (int)substr.rfind(delimiter_str, offset); if (offset != -1) { From c5d6237d986acc7b9174e6081471048bbad25d5f Mon Sep 17 00:00:00 2001 From: felixwluo Date: Mon, 21 Apr 2025 13:12:10 +0800 Subject: [PATCH 3/6] save --- .../string_functions/test_string_function.out | 13 +-- .../test_string_function.groovy | 98 ++++++++++++++----- 2 files changed, 82 insertions(+), 29 deletions(-) diff --git a/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out b/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out index f42c4292ad4071..be535f7402b3b1 100644 --- a/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out +++ b/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out @@ -381,12 +381,13 @@ doris \N -- !sql -- -1 BBB AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 _02|CCC_03|DDD_04|EEE_05|FFF_06 _02|CCC_03|DDD_04|EEE_05|FFF_06 -2 ccc zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06 zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06 zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06 -3 DDD AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 _04|EEE_05|FFF_06 _04|EEE_05|FFF_06 -4 DDD sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06 _04|rfv_05|rgb_06 _04|rfv_05|rgb_06 -5 eee cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06 cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06 cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06 -6 A_01 AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06 |dsd_02|ert_03|bgt_04|fgh_05|hyb_06 |dsd_02|ert_03|bgt_04|fgh_05|hyb_06 +AAA|BBB|CCC BBB |CCC |CCC +XXX|YYY|ZZZ YYY |ZZZ |ZZZ + +-- !sql -- +1 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 DDD _04|EEE_05|FFF_06 _04|EEE_05|FFF_06 +2 sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06 DDD _04|rfv_05|rgb_06 _04|rfv_05|rgb_06 +3 AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06 A_01 |dsd_02|ert_03|bgt_04|fgh_05|hyb_06 |dsd_02|ert_03|bgt_04|fgh_05|hyb_06 -- !sql -- tNEW-STRorigin str diff --git a/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy b/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy index 99d8e9dfddef56..472d59f8ab9251 100644 --- a/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy +++ b/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy @@ -189,29 +189,81 @@ suite("test_string_function") { qt_sql "select elt(2, \"hello\", \"doris\");" qt_sql "select elt(3, \"hello\", \"doris\");" qt_sql """ - SELECT t1.no - ,t1.sub_str - ,t1.str - ,substring_index(t1.str, t1.sub_str, -1) - ,t2.rst2 - FROM ( - SELECT 1 AS no, 'BBB' AS sub_str, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06' AS str UNION ALL - SELECT 2 AS no, 'ccc' AS sub_str, 'zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06' AS str UNION ALL - SELECT 3 AS no, 'DDD' AS sub_str, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06' AS str UNION ALL - SELECT 4 AS no, 'DDD' AS sub_str, 'sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06' AS str UNION ALL - SELECT 5 AS no, 'eee' AS sub_str, 'cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06' AS str UNION ALL - SELECT 6 AS no, 'A_01' AS sub_str, 'AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06' AS str - ) t1 - LEFT JOIN ( - SELECT 1 AS no, 'BBB' AS sub_str, substring_index('AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', 'BBB', -1) AS rst2 UNION ALL - SELECT 2 AS no, 'ccc' AS sub_str, substring_index('zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06', 'ccc', -1) AS rst2 UNION ALL - SELECT 3 AS no, 'DDD' AS sub_str, substring_index('AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', 'DDD', -1) AS rst2 UNION ALL - SELECT 4 AS no, 'DDD' AS sub_str, substring_index('sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06', 'DDD', -1) AS rst2 UNION ALL - SELECT 5 AS no, 'eee' AS sub_str, substring_index('cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06', 'eee', -1) AS rst2 UNION ALL - SELECT 6 AS no, 'A_01' AS sub_str, substring_index('AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06', 'A_01', -1) AS rst2 - ) t2 - ON t1.no = t2.no AND t1.sub_str = t2.sub_str - ORDER BY t1.no; + SELECT + t1.str AS original_string, + t1.delimiter AS delimiter, + substring_index(t1.str, t1.delimiter, -1) AS dynamic_result, + t2.expected_result + FROM + ( + SELECT + 'AAA|BBB|CCC' AS str, + 'BBB' AS delimiter + UNION + ALL + SELECT + 'XXX|YYY|ZZZ' AS str, + 'YYY' AS delimiter + ) t1 + LEFT JOIN ( + SELECT + 'AAA|BBB|CCC' AS str, + 'BBB' AS delimiter, + substring_index('AAA|BBB|CCC', 'BBB', -1) AS expected_result + UNION + ALL + SELECT + 'XXX|YYY|ZZZ' AS str, + 'YYY' AS delimiter, + substring_index('XXX|YYY|ZZZ', 'YYY', -1) AS expected_result + ) t2 ON t1.str = t2.str + AND t1.delimiter = t2.delimiter; + """ + + sql """ DROP TABLE IF EXISTS test_substring_index; """ + + sql """ + CREATE TABLE test_substring_index ( + id int NULL, + str varchar(150) NULL, + delimiter varchar(150) NULL + ) ENGINE = OLAP UNIQUE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 3 PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ + INSERT INTO test_substring_index VALUES + (1, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', 'DDD'), + (2, 'sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06', 'DDD'), + (3, 'AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06', 'A_01'); + """ + + qt_sql """ + SELECT + id, + str, + delimiter, + substring_index(str, delimiter, -1) AS dynamic_result, + CASE + WHEN id = 1 THEN substring_index( + 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', + 'DDD', + -1 + ) + WHEN id = 2 THEN substring_index( + 'sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06', + 'DDD', + -1 + ) + WHEN id = 3 THEN substring_index( + 'AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06', + 'A_01', + -1 + ) + END AS expected_result + FROM + test_substring_index order by id; """ qt_sql "select sub_replace(\"this is origin str\",\"NEW-STR\",1);" From 75cff698f8de08f49dd46c45b651f14d5c394c02 Mon Sep 17 00:00:00 2001 From: felixwluo Date: Mon, 21 Apr 2025 15:57:00 +0800 Subject: [PATCH 4/6] add p0 --- .../data/function_p0/test_substring_index.out | 48 ++++++ .../test_substring_index_columns.out | 23 +++ .../test_substring_index_simple.out | 82 +++++++++++ .../string_functions/test_string_function.out | 9 -- .../function_p0/test_substring_index.groovy | 137 ++++++++++++++++++ .../test_substring_index_columns.groovy | 114 +++++++++++++++ .../test_substring_index_simple.groovy | 89 ++++++++++++ .../test_string_function.groovy | 77 ---------- 8 files changed, 493 insertions(+), 86 deletions(-) create mode 100644 regression-test/data/function_p0/test_substring_index.out create mode 100644 regression-test/data/function_p0/test_substring_index_columns.out create mode 100644 regression-test/data/function_p0/test_substring_index_simple.out create mode 100644 regression-test/suites/function_p0/test_substring_index.groovy create mode 100644 regression-test/suites/function_p0/test_substring_index_columns.groovy create mode 100644 regression-test/suites/function_p0/test_substring_index_simple.groovy diff --git a/regression-test/data/function_p0/test_substring_index.out b/regression-test/data/function_p0/test_substring_index.out new file mode 100644 index 00000000000000..480f2c2b482028 --- /dev/null +++ b/regression-test/data/function_p0/test_substring_index.out @@ -0,0 +1,48 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +1 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 BBB -1 _02|CCC_03|DDD_04|EEE_05|FFF_06 +2 zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06 ccc -1 zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06 +3 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 DDD -1 _04|EEE_05|FFF_06 +4 sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06 DDD -1 _04|rfv_05|rgb_06 +5 cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06 eee -1 cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06 +6 AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06 A_01 -1 |dsd_02|ert_03|bgt_04|fgh_05|hyb_06 +7 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 BBB 1 AAA_01| +8 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 | 2 AAA_01|BBB_02 +9 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 | -2 EEE_05|FFF_06 +10 ABC | 1 ABC +11 ABC|DEF | 0 +12 ABC 1 +13 ABC|DEF|GHI 1 + +-- !sql -- +101 北京市|上海市|广州市|深圳市|成都市 | 2 北京市|上海市 +102 北京市|上海市|广州市|深圳市|成都市 | -2 深圳市|成都市 +103 北京市|上海市|广州市|深圳市|成都市 上海 -1 市|广州市|深圳市|成都市 +104 中国人民共和国 人民 1 中国 +105 中国人民共和国 人民 -1 共和国 +106 你好,世界!你好,朋友! 你好 1 +107 你好,世界!你好,朋友! 你好 -1 ,朋友! +108 你好,世界!你好,朋友! 世界 -1 !你好,朋友! +109 中文|测试|数据 测试 1 中文| +110 中文|测试|数据 测试 -1 |数据 + +-- !sql -- +201 hello😀world😀example 😀 1 hello +202 hello😀world😀example 😀 2 hello😀world +203 hello😀world😀example 😀 -1 example +204 👋👋hello👋world👋 👋 2 👋 +205 👋👋hello👋world👋 👋 -2 world👋 + +-- !sql -- +1 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 BBB -1 _02|CCC_03|DDD_04|EEE_05|FFF_06 +3 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 DDD -1 _04|EEE_05|FFF_06 +7 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 BBB 1 AAA_01| +8 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 | 2 AAA_01|BBB_02 +101 北京市|上海市|广州市|深圳市|成都市 | 2 北京市|上海市 +103 北京市|上海市|广州市|深圳市|成都市 上海 -1 市|广州市|深圳市|成都市 +201 hello😀world😀example 😀 1 hello +203 hello😀world😀example 😀 -1 example + +-- !sql -- + test|test test|test + diff --git a/regression-test/data/function_p0/test_substring_index_columns.out b/regression-test/data/function_p0/test_substring_index_columns.out new file mode 100644 index 00000000000000..ef023cd37a2956 --- /dev/null +++ b/regression-test/data/function_p0/test_substring_index_columns.out @@ -0,0 +1,23 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +1 BBB AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 _02|CCC_03|DDD_04|EEE_05|FFF_06 +2 ccc zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06 zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06 +3 DDD AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 _04|EEE_05|FFF_06 +4 DDD sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06 _04|rfv_05|rgb_06 +5 eee cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06 cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06 +6 A_01 AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06 |dsd_02|ert_03|bgt_04|fgh_05|hyb_06 + +-- !sql -- +7 市 北京市|上海市|广州市|深圳市 北京 +8 人民 中华人民共和国 中华 共和国 +9 分隔符 中文分隔符测试分隔符数据 中文 数据 +10 你好 你好,世界!你好,朋友! ,朋友! + +-- !sql -- +1 field1,field2,field3,field4 , 2 field1,field2 +2 field1,field2,field3,field4 , -1 field4 +3 AAA_01|BBB_02|CCC_03 | 2 AAA_01|BBB_02 +4 AAA_01|BBB_02|CCC_03 | -2 BBB_02|CCC_03 +5 中文分隔符测试分隔符数据 分隔符 1 中文 +6 中文分隔符测试分隔符数据 分隔符 -1 数据 + diff --git a/regression-test/data/function_p0/test_substring_index_simple.out b/regression-test/data/function_p0/test_substring_index_simple.out new file mode 100644 index 00000000000000..ca236ccc9bd7bc --- /dev/null +++ b/regression-test/data/function_p0/test_substring_index_simple.out @@ -0,0 +1,82 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +AAA_01 + +-- !sql -- +CCC_03 + +-- !sql -- +AAA_01|BBB_02 + +-- !sql -- +BBB_02|CCC_03 + +-- !sql -- +AAA_01|BBB_02|CCC_03 + +-- !sql -- +AAA_01|BBB_02|CCC_03 + +-- !sql -- + + +-- !sql -- + + +-- !sql -- + + +-- !sql -- +AAA_01|BBB_02|CCC_03 + +-- !sql -- +AAA_01|BBB_02|CCC_03 + +-- !sql -- +AAA_01 + +-- !sql -- +CCC_03 + +-- !sql -- +_02|CCC_03 + +-- !sql -- + + +-- !sql -- + + +-- !sql -- +北京市|上海市 + +-- !sql -- +北京市 + +-- !sql -- +广州市 + +-- !sql -- +hello + +-- !sql -- +example + +-- !sql -- +AAA_01|BBB_02 + +-- !sql -- +AAA_01|BBB_02 + +-- !sql -- +AAA_01|BBB_02 + +-- !sql -- +AAA_01|BBB_02 + +-- !sql -- +中文_ + +-- !sql -- +_02|CCC_03 AAA_01|BBB_02|CCC_03 + diff --git a/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out b/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out index be535f7402b3b1..d85794989f7de0 100644 --- a/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out +++ b/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out @@ -380,15 +380,6 @@ doris -- !sql -- \N --- !sql -- -AAA|BBB|CCC BBB |CCC |CCC -XXX|YYY|ZZZ YYY |ZZZ |ZZZ - --- !sql -- -1 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 DDD _04|EEE_05|FFF_06 _04|EEE_05|FFF_06 -2 sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06 DDD _04|rfv_05|rgb_06 _04|rfv_05|rgb_06 -3 AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06 A_01 |dsd_02|ert_03|bgt_04|fgh_05|hyb_06 |dsd_02|ert_03|bgt_04|fgh_05|hyb_06 - -- !sql -- tNEW-STRorigin str diff --git a/regression-test/suites/function_p0/test_substring_index.groovy b/regression-test/suites/function_p0/test_substring_index.groovy new file mode 100644 index 00000000000000..47df1208a0072c --- /dev/null +++ b/regression-test/suites/function_p0/test_substring_index.groovy @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_substring_index") { + sql "set enable_nereids_planner=true" + sql "set enable_fallback_to_original_planner=false" + + sql "DROP TABLE IF EXISTS test_substring_index" + sql """ + CREATE TABLE test_substring_index ( + id INT, + str VARCHAR(100), + delimiter VARCHAR(10), + count INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ) + """ + + sql """ + INSERT INTO test_substring_index VALUES + (1, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', 'BBB', -1), + (2, 'zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06', 'ccc', -1), + (3, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', 'DDD', -1), + (4, 'sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06', 'DDD', -1), + (5, 'cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06', 'eee', -1), + (6, 'AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06', 'A_01', -1), + (7, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', 'BBB', 1), + (8, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', '|', 2), + (9, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', '|', -2), + (10, 'ABC', '|', 1), + (11, 'ABC|DEF', '|', 0), + (12, '', 'ABC', 1), + (13, 'ABC|DEF|GHI', '', 1) + """ + + sql """ + INSERT INTO test_substring_index VALUES + (101, '北京市|上海市|广州市|深圳市|成都市', '|', 2), + (102, '北京市|上海市|广州市|深圳市|成都市', '|', -2), + (103, '北京市|上海市|广州市|深圳市|成都市', '上海', -1), + (104, '中国人民共和国', '人民', 1), + (105, '中国人民共和国', '人民', -1), + (106, '你好,世界!你好,朋友!', '你好', 1), + (107, '你好,世界!你好,朋友!', '你好', -1), + (108, '你好,世界!你好,朋友!', '世界', -1), + (109, '中文|测试|数据', '测试', 1), + (110, '中文|测试|数据', '测试', -1) + """ + + sql """ + INSERT INTO test_substring_index VALUES + (201, 'hello😀world😀example', '😀', 1), + (202, 'hello😀world😀example', '😀', 2), + (203, 'hello😀world😀example', '😀', -1), + (204, '👋👋hello👋world👋', '👋', 2), + (205, '👋👋hello👋world👋', '👋', -2) + """ + + qt_sql """ + SELECT + id, + str, + delimiter, + count, + substring_index(str, delimiter, count) as result + FROM test_substring_index + WHERE id BETWEEN 1 AND 13 + ORDER BY id + """ + + qt_sql """ + SELECT + id, + str, + delimiter, + count, + substring_index(str, delimiter, count) as result + FROM test_substring_index + WHERE id BETWEEN 101 AND 110 + ORDER BY id + """ + + qt_sql """ + SELECT + id, + str, + delimiter, + count, + substring_index(str, delimiter, count) as result + FROM test_substring_index + WHERE id BETWEEN 201 AND 205 + ORDER BY id + """ + + qt_sql """ + SELECT + a.id, + a.str, + a.delimiter, + b.count, + substring_index(a.str, a.delimiter, b.count) as result + FROM test_substring_index a + JOIN test_substring_index b ON a.id = b.id + WHERE a.id IN (1, 3, 7, 8, 101, 103, 201, 203) + ORDER BY a.id + """ + + qt_sql """ + SELECT + substring_index('', '', 1) as empty_all, + substring_index('test', '', 1) as empty_delimiter, + substring_index('', 'test', 1) as empty_string, + substring_index('test', 'test', 0) as zero_count, + substring_index('test|test', '|', 999) as large_count, + substring_index('test|test', '|', -999) as large_negative_count + """ + + sql "DROP TABLE IF EXISTS test_substring_index" +} \ No newline at end of file diff --git a/regression-test/suites/function_p0/test_substring_index_columns.groovy b/regression-test/suites/function_p0/test_substring_index_columns.groovy new file mode 100644 index 00000000000000..80fb3b5318da1d --- /dev/null +++ b/regression-test/suites/function_p0/test_substring_index_columns.groovy @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_substring_index_columns") { + sql "set enable_nereids_planner=true" + sql "set enable_fallback_to_original_planner=false" + + sql "DROP TABLE IF EXISTS test_substring_index_compat" + sql """ + CREATE TABLE test_substring_index_compat ( + no INT, + sub_str VARCHAR(50), + str VARCHAR(100) + ) ENGINE=OLAP + DUPLICATE KEY(no) + DISTRIBUTED BY HASH(no) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ) + """ + + sql """ + INSERT INTO test_substring_index_compat VALUES + (1, 'BBB', 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06'), + (2, 'ccc', 'zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06'), + (3, 'DDD', 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06'), + (4, 'DDD', 'sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06'), + (5, 'eee', 'cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06'), + (6, 'A_01', 'AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06') + """ + + qt_sql """ + SELECT + no, + sub_str AS '分隔符字符串', + str AS '需要截取的字符串', + substring_index(str, sub_str, -1) AS '动态分隔符结果' + FROM test_substring_index_compat + ORDER BY no + """ + + sql """ + INSERT INTO test_substring_index_compat VALUES + (7, '市', '北京市|上海市|广州市|深圳市'), + (8, '人民', '中华人民共和国'), + (9, '分隔符', '中文分隔符测试分隔符数据'), + (10, '你好', '你好,世界!你好,朋友!') + """ + + qt_sql """ + SELECT + no, + sub_str AS '分隔符字符串', + str AS '需要截取的字符串', + substring_index(str, sub_str, 1) AS '正向截取', + substring_index(str, sub_str, -1) AS '反向截取' + FROM test_substring_index_compat + WHERE no > 6 + ORDER BY no + """ + + sql "DROP TABLE IF EXISTS test_dynamic_params" + sql """ + CREATE TABLE test_dynamic_params ( + id INT, + source_str VARCHAR(100), + delimiter VARCHAR(20), + count_val INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ) + """ + + sql """ + INSERT INTO test_dynamic_params VALUES + (1, 'field1,field2,field3,field4', ',', 2), + (2, 'field1,field2,field3,field4', ',', -1), + (3, 'AAA_01|BBB_02|CCC_03', '|', 2), + (4, 'AAA_01|BBB_02|CCC_03', '|', -2), + (5, '中文分隔符测试分隔符数据', '分隔符', 1), + (6, '中文分隔符测试分隔符数据', '分隔符', -1) + """ + + qt_sql """ + SELECT + id, + source_str, + delimiter, + count_val, + substring_index(source_str, delimiter, count_val) AS result + FROM test_dynamic_params + ORDER BY id + """ + + sql "DROP TABLE IF EXISTS test_substring_index_compat" + sql "DROP TABLE IF EXISTS test_dynamic_params" +} \ No newline at end of file diff --git a/regression-test/suites/function_p0/test_substring_index_simple.groovy b/regression-test/suites/function_p0/test_substring_index_simple.groovy new file mode 100644 index 00000000000000..82d97e2378233e --- /dev/null +++ b/regression-test/suites/function_p0/test_substring_index_simple.groovy @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_substring_index_simple") { + sql "set enable_nereids_planner=true" + sql "set enable_fallback_to_original_planner=false" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', 1) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', -1) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', 2) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', -2) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', 'XYZ', 1) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', 'XYZ', -1) as result""" + + qt_sql """SELECT substring_index('', '|', 1) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '', 1) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', 0) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', 10) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', -10) as result""" + + qt_sql """SELECT substring_index('AAA_01||BBB_02||CCC_03', '||', 1) as result""" + + qt_sql """SELECT substring_index('AAA_01||BBB_02||CCC_03', '||', -1) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', 'BBB', -1) as result""" + + qt_sql """SELECT substring_index('|AAA_01|BBB_02|CCC_03', '|', 1) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03|', '|', -1) as result""" + + qt_sql """SELECT substring_index('北京市|上海市|广州市', '|', 2) as result""" + + qt_sql """SELECT substring_index('北京市分隔符上海市分隔符广州市', '分隔符', 1) as result""" + + qt_sql """SELECT substring_index('北京市分隔符上海市分隔符广州市', '分隔符', -1) as result""" + + qt_sql """SELECT substring_index('hello😀world😀example', '😀', 1) as result""" + + qt_sql """SELECT substring_index('hello😀world😀example', '😀', -1) as result""" + + qt_sql """ + SELECT substring_index('AAA_01|BBB_02|CCC_03', (SELECT '|'), 2) as result + """ + + qt_sql """ + SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', (SELECT 2)) as result + """ + + qt_sql """ + SELECT substring_index('AAA_01|BBB_02|CCC_03', (SELECT '|'), (SELECT 2)) as result + """ + + qt_sql """ + SELECT substring_index('AAA_01|BBB_02|CCC_03', concat('|'), 2) as result + """ + + qt_sql """ + SELECT substring_index('中文_分隔符_测试_分隔符_数据', concat('分', '隔', '符'), 1) as result + """ + + qt_sql """ + SELECT + substring_index('AAA_01|BBB_02|CCC_03', 'BBB', -1) as result1, + substring_index('AAA_01|BBB_02|CCC_03', 'bbb', -1) as result2 + """ +} \ No newline at end of file diff --git a/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy b/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy index 472d59f8ab9251..6e9cd947bc2ed5 100644 --- a/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy +++ b/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy @@ -188,83 +188,6 @@ suite("test_string_function") { qt_sql "select elt(1, \"hello\", \"doris\");" qt_sql "select elt(2, \"hello\", \"doris\");" qt_sql "select elt(3, \"hello\", \"doris\");" - qt_sql """ - SELECT - t1.str AS original_string, - t1.delimiter AS delimiter, - substring_index(t1.str, t1.delimiter, -1) AS dynamic_result, - t2.expected_result - FROM - ( - SELECT - 'AAA|BBB|CCC' AS str, - 'BBB' AS delimiter - UNION - ALL - SELECT - 'XXX|YYY|ZZZ' AS str, - 'YYY' AS delimiter - ) t1 - LEFT JOIN ( - SELECT - 'AAA|BBB|CCC' AS str, - 'BBB' AS delimiter, - substring_index('AAA|BBB|CCC', 'BBB', -1) AS expected_result - UNION - ALL - SELECT - 'XXX|YYY|ZZZ' AS str, - 'YYY' AS delimiter, - substring_index('XXX|YYY|ZZZ', 'YYY', -1) AS expected_result - ) t2 ON t1.str = t2.str - AND t1.delimiter = t2.delimiter; - """ - - sql """ DROP TABLE IF EXISTS test_substring_index; """ - - sql """ - CREATE TABLE test_substring_index ( - id int NULL, - str varchar(150) NULL, - delimiter varchar(150) NULL - ) ENGINE = OLAP UNIQUE KEY(id) DISTRIBUTED BY HASH(id) BUCKETS 3 PROPERTIES ( - "replication_allocation" = "tag.location.default: 1" - ); - """ - - sql """ - INSERT INTO test_substring_index VALUES - (1, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', 'DDD'), - (2, 'sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06', 'DDD'), - (3, 'AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06', 'A_01'); - """ - - qt_sql """ - SELECT - id, - str, - delimiter, - substring_index(str, delimiter, -1) AS dynamic_result, - CASE - WHEN id = 1 THEN substring_index( - 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', - 'DDD', - -1 - ) - WHEN id = 2 THEN substring_index( - 'sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06', - 'DDD', - -1 - ) - WHEN id = 3 THEN substring_index( - 'AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06', - 'A_01', - -1 - ) - END AS expected_result - FROM - test_substring_index order by id; - """ qt_sql "select sub_replace(\"this is origin str\",\"NEW-STR\",1);" qt_sql "select sub_replace(\"doris\",\"***\",1,2);" From b24fcdf34f88bcc83823521124d2034ab2478466 Mon Sep 17 00:00:00 2001 From: felixwluo Date: Mon, 21 Apr 2025 22:05:35 +0800 Subject: [PATCH 5/6] fix --- be/src/vec/functions/function_string.h | 31 +++++++++++++++++++------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 315bc15ef98f85..f5853fcc90b1e9 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -1815,15 +1815,22 @@ class FunctionSubstringIndex : public IFunction { bool part_num_const = false; std::tie(part_num_column_ptr, part_num_const) = unpack_if_const(block.get_by_position(arguments[2]).column); - const auto* part_num_col = part_num_column_ptr.get(); + const ColumnVector* part_num_col = assert_cast*>(part_num_column_ptr.get()); + + // For constant multi-character delimiters, create StringRef and StringSearch only once + std::optional const_delimiter_ref; + std::optional const_search; + if (delimiter_const && delimiter_col->get_data_at(0).size > 1) { + const_delimiter_ref.emplace(delimiter_col->get_data_at(0)); + const_search.emplace(&const_delimiter_ref.value()); + } for (size_t i = 0; i < input_rows_count; ++i) { auto str = str_col->get_data_at(i); auto delimiter = delimiter_col->get_data_at(delimiter_const ? 0 : i); int32_t delimiter_size = delimiter.size; - const auto* part_num_data = part_num_col->get_data_at(part_num_const ? 0 : i).data; - auto part_number = *reinterpret_cast(part_num_data); + auto part_number = part_num_col->get_element(part_num_const ? 0 : i); if (part_number == 0 || delimiter_size == 0) { StringOP::push_empty_string(i, res_chars, res_offsets); @@ -1859,14 +1866,20 @@ class FunctionSubstringIndex : public IFunction { } } else { // For multi-character delimiters - StringRef delimiter_ref(delimiter); - StringSearch search(&delimiter_ref); + // Use pre-created StringRef and StringSearch for constant delimiters + StringRef delimiter_ref = const_delimiter_ref ? const_delimiter_ref.value() : StringRef(delimiter); + const StringSearch* search_ptr = const_search ? &const_search.value() : nullptr; + StringSearch local_search(&delimiter_ref); + if (!search_ptr) { + search_ptr = &local_search; + } + int32_t offset = -delimiter_size; int32_t num = 0; while (num < part_number) { size_t n = str.size - offset - delimiter_size; // search first match delimter_ref index from src string among str_offset to end - const char* pos = search.search(str.data + offset + delimiter_size, n); + const char* pos = search_ptr->search(str.data + offset + delimiter_size, n); if (pos < str.data + str.size) { offset = pos - str.data; num++; @@ -1895,8 +1908,10 @@ class FunctionSubstringIndex : public IFunction { int32_t num = 0; auto substr = str_str; - StringRef delimiter_str(reinterpret_cast(delimiter.data), - delimiter.size); + // Use pre-created StringRef for constant delimiters + StringRef delimiter_str = const_delimiter_ref ? const_delimiter_ref.value() : + StringRef(reinterpret_cast(delimiter.data), delimiter.size); + while (num <= neg_part_number && offset >= 0) { offset = (int)substr.rfind(delimiter_str, offset); if (offset != -1) { From d836a1410af52dc6aad1d1f7d2a01de970a2ffd9 Mon Sep 17 00:00:00 2001 From: felixwluo Date: Mon, 21 Apr 2025 22:19:40 +0800 Subject: [PATCH 6/6] format --- be/src/vec/functions/function_string.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index f5853fcc90b1e9..bbad824a266862 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -1815,7 +1815,8 @@ class FunctionSubstringIndex : public IFunction { bool part_num_const = false; std::tie(part_num_column_ptr, part_num_const) = unpack_if_const(block.get_by_position(arguments[2]).column); - const ColumnVector* part_num_col = assert_cast*>(part_num_column_ptr.get()); + const ColumnVector* part_num_col = + assert_cast*>(part_num_column_ptr.get()); // For constant multi-character delimiters, create StringRef and StringSearch only once std::optional const_delimiter_ref; @@ -1867,7 +1868,8 @@ class FunctionSubstringIndex : public IFunction { } else { // For multi-character delimiters // Use pre-created StringRef and StringSearch for constant delimiters - StringRef delimiter_ref = const_delimiter_ref ? const_delimiter_ref.value() : StringRef(delimiter); + StringRef delimiter_ref = const_delimiter_ref ? const_delimiter_ref.value() + : StringRef(delimiter); const StringSearch* search_ptr = const_search ? &const_search.value() : nullptr; StringSearch local_search(&delimiter_ref); if (!search_ptr) { @@ -1909,8 +1911,11 @@ class FunctionSubstringIndex : public IFunction { auto substr = str_str; // Use pre-created StringRef for constant delimiters - StringRef delimiter_str = const_delimiter_ref ? const_delimiter_ref.value() : - StringRef(reinterpret_cast(delimiter.data), delimiter.size); + StringRef delimiter_str = + const_delimiter_ref + ? const_delimiter_ref.value() + : StringRef(reinterpret_cast(delimiter.data), + delimiter.size); while (num <= neg_part_number && offset >= 0) { offset = (int)substr.rfind(delimiter_str, offset);