Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 56 additions & 29 deletions be/src/vec/functions/function_string.h
Original file line number Diff line number Diff line change
Expand Up @@ -1804,24 +1804,42 @@ class FunctionSubstringIndex : public IFunction {

const auto* str_col = assert_cast<const ColumnString*>(content_column.get());

[[maybe_unused]] const auto& [delimiter_col, delimiter_const] =
// Handle both constant and non-constant delimiter parameters
ColumnPtr delimiter_column_ptr;
bool delimiter_const = false;
std::tie(delimiter_column_ptr, delimiter_const) =
unpack_if_const(block.get_by_position(arguments[1]).column);
auto delimiter = delimiter_col->get_data_at(0);
int32_t delimiter_size = delimiter.size;
const auto* delimiter_col = assert_cast<const ColumnString*>(delimiter_column_ptr.get());

[[maybe_unused]] const auto& [part_num_col, part_const] =
ColumnPtr part_num_column_ptr;
bool part_num_const = false;
std::tie(part_num_column_ptr, part_num_const) =
unpack_if_const(block.get_by_position(arguments[2]).column);
auto part_number = *((int*)part_num_col->get_data_at(0).data);
const ColumnVector<Int32>* part_num_col =
assert_cast<const ColumnVector<Int32>*>(part_num_column_ptr.get());

if (part_number == 0 || delimiter_size == 0) {
for (size_t i = 0; i < input_rows_count; ++i) {
// For constant multi-character delimiters, create StringRef and StringSearch only once
std::optional<StringRef> const_delimiter_ref;
std::optional<StringSearch> const_search;
if (delimiter_const && delimiter_col->get_data_at(0).size > 1) {
const_delimiter_ref.emplace(delimiter_col->get_data_at(0));
const_search.emplace(&const_delimiter_ref.value());
}

for (size_t i = 0; i < input_rows_count; ++i) {
auto str = str_col->get_data_at(i);
auto delimiter = delimiter_col->get_data_at(delimiter_const ? 0 : i);
int32_t delimiter_size = delimiter.size;

auto part_number = part_num_col->get_element(part_num_const ? 0 : i);

if (part_number == 0 || delimiter_size == 0) {
StringOP::push_empty_string(i, res_chars, res_offsets);
continue;
}
} else if (part_number > 0) {
if (delimiter_size == 1) {
// If delimiter is a char, use memchr to split
for (size_t i = 0; i < input_rows_count; ++i) {
auto str = str_col->get_data_at(i);

if (part_number > 0) {
if (delimiter_size == 1) {
int32_t offset = -1;
int32_t num = 0;
while (num < part_number) {
Expand All @@ -1847,18 +1865,23 @@ class FunctionSubstringIndex : public IFunction {
StringOP::push_value_string(std::string_view(str.data, str.size), i,
res_chars, res_offsets);
}
}
} else {
StringRef delimiter_ref(delimiter);
StringSearch search(&delimiter_ref);
for (size_t i = 0; i < input_rows_count; ++i) {
auto str = str_col->get_data_at(i);
} else {
// For multi-character delimiters
// Use pre-created StringRef and StringSearch for constant delimiters
StringRef delimiter_ref = const_delimiter_ref ? const_delimiter_ref.value()
: StringRef(delimiter);
const StringSearch* search_ptr = const_search ? &const_search.value() : nullptr;
StringSearch local_search(&delimiter_ref);
if (!search_ptr) {
search_ptr = &local_search;
}

int32_t offset = -delimiter_size;
int32_t num = 0;
while (num < part_number) {
size_t n = str.size - offset - delimiter_size;
// search for the first occurrence of delimiter_ref in the source string, from the current offset to the end
const char* pos = search.search(str.data + offset + delimiter_size, n);
const char* pos = search_ptr->search(str.data + offset + delimiter_size, n);
if (pos < str.data + str.size) {
offset = pos - str.data;
num++;
Expand All @@ -1879,21 +1902,25 @@ class FunctionSubstringIndex : public IFunction {
res_chars, res_offsets);
}
}
}
} else {
// if part_number is negative
part_number = -part_number;
for (size_t i = 0; i < input_rows_count; ++i) {
auto str = str_col->get_data_at(i);
} else {
int neg_part_number = -part_number;
auto str_str = str.to_string();
int32_t offset = str.size;
int32_t pre_offset = offset;
int32_t num = 0;
auto substr = str_str;
while (num <= part_number && offset >= 0) {
offset = (int)substr.rfind(delimiter, offset);

// Use pre-created StringRef for constant delimiters
StringRef delimiter_str =
const_delimiter_ref
? const_delimiter_ref.value()
: StringRef(reinterpret_cast<const char*>(delimiter.data),
delimiter.size);

while (num <= neg_part_number && offset >= 0) {
offset = (int)substr.rfind(delimiter_str, offset);
if (offset != -1) {
if (++num == part_number) {
if (++num == neg_part_number) {
break;
}
pre_offset = offset;
Expand All @@ -1905,7 +1932,7 @@ class FunctionSubstringIndex : public IFunction {
}
num = (offset == -1 && num != 0) ? num + 1 : num;

if (num == part_number) {
if (num == neg_part_number) {
if (offset == -1) {
StringOP::push_value_string(std::string_view(str.data, str.size), i,
res_chars, res_offsets);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
package org.apache.doris.nereids.trees.expressions.functions.scalar;

import org.apache.doris.catalog.FunctionSignature;
import org.apache.doris.nereids.exceptions.AnalysisException;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
Expand Down Expand Up @@ -53,16 +52,6 @@ public SubstringIndex(Expression arg0, Expression arg1, Expression arg2) {
super("substring_index", arg0, arg1, arg2);
}

/**
 * Validates the arguments before type coercion: every argument after the
 * first one (index 1 and up) must be a constant expression.
 *
 * @throws AnalysisException if any argument other than the first is not constant
 */
@Override
public void checkLegalityBeforeTypeCoercion() {
    // Index 0 is deliberately skipped; only the trailing arguments are checked.
    for (int argIdx = 1; argIdx < children.size(); argIdx++) {
        if (getArgument(argIdx).isConstant()) {
            continue;
        }
        String message = getName()
                + " function except for the first argument, other parameter must be a constant.";
        throw new AnalysisException(message);
    }
}

/**
* withChildren.
*/
Expand Down
48 changes: 48 additions & 0 deletions regression-test/data/function_p0/test_substring_index.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
1 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 BBB -1 _02|CCC_03|DDD_04|EEE_05|FFF_06
2 zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06 ccc -1 zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06
3 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 DDD -1 _04|EEE_05|FFF_06
4 sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06 DDD -1 _04|rfv_05|rgb_06
5 cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06 eee -1 cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06
6 AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06 A_01 -1 |dsd_02|ert_03|bgt_04|fgh_05|hyb_06
7 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 BBB 1 AAA_01|
8 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 | 2 AAA_01|BBB_02
9 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 | -2 EEE_05|FFF_06
10 ABC | 1 ABC
11 ABC|DEF | 0
12 ABC 1
13 ABC|DEF|GHI 1

-- !sql --
101 北京市|上海市|广州市|深圳市|成都市 | 2 北京市|上海市
102 北京市|上海市|广州市|深圳市|成都市 | -2 深圳市|成都市
103 北京市|上海市|广州市|深圳市|成都市 上海 -1 市|广州市|深圳市|成都市
104 中国人民共和国 人民 1 中国
105 中国人民共和国 人民 -1 共和国
106 你好,世界!你好,朋友! 你好 1
107 你好,世界!你好,朋友! 你好 -1 ,朋友!
108 你好,世界!你好,朋友! 世界 -1 !你好,朋友!
109 中文|测试|数据 测试 1 中文|
110 中文|测试|数据 测试 -1 |数据

-- !sql --
201 hello😀world😀example 😀 1 hello
202 hello😀world😀example 😀 2 hello😀world
203 hello😀world😀example 😀 -1 example
204 👋👋hello👋world👋 👋 2 👋
205 👋👋hello👋world👋 👋 -2 world👋

-- !sql --
1 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 BBB -1 _02|CCC_03|DDD_04|EEE_05|FFF_06
3 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 DDD -1 _04|EEE_05|FFF_06
7 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 BBB 1 AAA_01|
8 AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 | 2 AAA_01|BBB_02
101 北京市|上海市|广州市|深圳市|成都市 | 2 北京市|上海市
103 北京市|上海市|广州市|深圳市|成都市 上海 -1 市|广州市|深圳市|成都市
201 hello😀world😀example 😀 1 hello
203 hello😀world😀example 😀 -1 example

-- !sql --
test|test test|test

23 changes: 23 additions & 0 deletions regression-test/data/function_p0/test_substring_index_columns.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
1 BBB AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 _02|CCC_03|DDD_04|EEE_05|FFF_06
2 ccc zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06 zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06
3 DDD AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06 _04|EEE_05|FFF_06
4 DDD sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06 _04|rfv_05|rgb_06
5 eee cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06 cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06
6 A_01 AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06 |dsd_02|ert_03|bgt_04|fgh_05|hyb_06

-- !sql --
7 市 北京市|上海市|广州市|深圳市 北京
8 人民 中华人民共和国 中华 共和国
9 分隔符 中文分隔符测试分隔符数据 中文 数据
10 你好 你好,世界!你好,朋友! ,朋友!

-- !sql --
1 field1,field2,field3,field4 , 2 field1,field2
2 field1,field2,field3,field4 , -1 field4
3 AAA_01|BBB_02|CCC_03 | 2 AAA_01|BBB_02
4 AAA_01|BBB_02|CCC_03 | -2 BBB_02|CCC_03
5 中文分隔符测试分隔符数据 分隔符 1 中文
6 中文分隔符测试分隔符数据 分隔符 -1 数据

82 changes: 82 additions & 0 deletions regression-test/data/function_p0/test_substring_index_simple.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
AAA_01

-- !sql --
CCC_03

-- !sql --
AAA_01|BBB_02

-- !sql --
BBB_02|CCC_03

-- !sql --
AAA_01|BBB_02|CCC_03

-- !sql --
AAA_01|BBB_02|CCC_03

-- !sql --


-- !sql --


-- !sql --


-- !sql --
AAA_01|BBB_02|CCC_03

-- !sql --
AAA_01|BBB_02|CCC_03

-- !sql --
AAA_01

-- !sql --
CCC_03

-- !sql --
_02|CCC_03

-- !sql --


-- !sql --


-- !sql --
北京市|上海市

-- !sql --
北京市

-- !sql --
广州市

-- !sql --
hello

-- !sql --
example

-- !sql --
AAA_01|BBB_02

-- !sql --
AAA_01|BBB_02

-- !sql --
AAA_01|BBB_02

-- !sql --
AAA_01|BBB_02

-- !sql --
中文_

-- !sql --
_02|CCC_03 AAA_01|BBB_02|CCC_03

Loading
Loading