Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 84 additions & 16 deletions be/src/vec/functions/function_string.h
Original file line number Diff line number Diff line change
Expand Up @@ -1755,6 +1755,7 @@ class FunctionSplitByString : public IFunction {
const auto& [right_column, right_const] =
unpack_if_const(block.get_by_position(arguments[1]).column);

DataTypePtr right_column_type = block.get_by_position(arguments[1]).type;
DataTypePtr src_column_type = block.get_by_position(arguments[0]).type;
auto dest_column_ptr = ColumnArray::create(make_nullable(src_column_type)->create_column(),
ColumnArray::ColumnOffsets::create());
Expand All @@ -1770,27 +1771,42 @@ class FunctionSplitByString : public IFunction {
dest_nested_column = dest_nullable_col->get_nested_column_ptr();
dest_nested_null_map = &dest_nullable_col->get_null_map_column().get_data();

if (auto col_left = check_and_get_column<ColumnString>(src_column.get())) {
if (auto col_right = check_and_get_column<ColumnString>(right_column.get())) {
if (right_const) {
_execute_constant(*col_left, col_right->get_data_at(0), *dest_nested_column,
dest_offsets, dest_nested_null_map);
} else {
_execute_vector(*col_left, *col_right, *dest_nested_column, dest_offsets,
dest_nested_null_map);
}
auto col_left = check_and_get_column<ColumnString>(src_column.get());
if (!col_left) {
return Status::InternalError("Left operator of function {} can not be {}", get_name(),
src_column_type->get_name());
}

block.replace_by_position(result, std::move(dest_column_ptr));
return Status::OK();
}
auto col_right = check_and_get_column<ColumnString>(right_column.get());
if (!col_right) {
return Status::InternalError("Right operator of function {} can not be {}", get_name(),
right_column_type->get_name());
}

// split_by_string(ColumnString, "xxx")
if (right_const) {
_execute_constant_delimiter(*col_left, col_right->get_data_at(0), *dest_nested_column,
dest_offsets, dest_nested_null_map);
} else if (left_const) {
// split_by_string("xxx", ColumnString)
_execute_constant_src_string(col_left->get_data_at(0), *col_right, *dest_nested_column,
dest_offsets, dest_nested_null_map);
} else {
// split_by_string(ColumnString, ColumnString)
_execute_vector(*col_left, *col_right, *dest_nested_column, dest_offsets,
dest_nested_null_map);
}
return Status::RuntimeError("unimplements function {}", get_name());

block.replace_by_position(result, std::move(dest_column_ptr));

return Status::OK();
}

private:
void _execute_constant(const ColumnString& src_column_string, const StringRef& delimiter_ref,
IColumn& dest_nested_column, ColumnArray::Offsets64& dest_offsets,
NullMapType* dest_nested_null_map) const {
void _execute_constant_delimiter(const ColumnString& src_column_string,
const StringRef& delimiter_ref, IColumn& dest_nested_column,
ColumnArray::Offsets64& dest_offsets,
NullMapType* dest_nested_null_map) const {
ColumnString& dest_column_string = reinterpret_cast<ColumnString&>(dest_nested_column);
ColumnString::Chars& column_string_chars = dest_column_string.get_chars();
ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets();
Expand Down Expand Up @@ -1911,6 +1927,58 @@ class FunctionSplitByString : public IFunction {
}
}

void _execute_constant_src_string(const StringRef& str_ref, const ColumnString& delimiter_col,
IColumn& dest_nested_column,
ColumnArray::Offsets64& dest_offsets,
NullMapType* dest_nested_null_map) const {
ColumnString& dest_column_string = reinterpret_cast<ColumnString&>(dest_nested_column);
ColumnString::Chars& column_string_chars = dest_column_string.get_chars();
ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets();
column_string_chars.reserve(0);

ColumnArray::Offset64 string_pos = 0;
ColumnArray::Offset64 dest_pos = 0;
const ColumnArray::Offset64 delimiter_offsets_size = delimiter_col.get_offsets().size();

for (size_t i = 0; i < delimiter_offsets_size; ++i) {
const StringRef delimiter_ref = delimiter_col.get_data_at(i);

if (delimiter_ref.size == 0) {
for (size_t str_pos = 0; str_pos < str_ref.size;) {
const size_t str_offset = str_pos;
const size_t old_size = column_string_chars.size();
str_pos++;
const size_t new_size = old_size + 1;
column_string_chars.resize(new_size);
memcpy(column_string_chars.data() + old_size, str_ref.data + str_offset, 1);
(*dest_nested_null_map).push_back(false);
string_pos++;
dest_pos++;
column_string_offsets.push_back(string_pos);
}
} else {
for (size_t str_pos = 0; str_pos <= str_ref.size;) {
const size_t str_offset = str_pos;
const size_t old_size = column_string_chars.size();
const size_t split_part_size = split_str(str_pos, str_ref, delimiter_ref);
str_pos += delimiter_ref.size;
const size_t new_size = old_size + split_part_size;
column_string_chars.resize(new_size);
if (split_part_size > 0) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if str_ref.length > 16?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know. The snippet is copied from #18496.

It seems that there is no guarantee of memory alignment here. I will figure it out.

Copy link
Contributor Author

@zhiqiang-hhhh zhiqiang-hhhh Dec 7, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we are safe here, since ColumnString uses PaddedPODArray as its data type.
`using PaddedPODArray = PODArray<T, initial_bytes, TAllocator, 15, 16>;` means it will always be safe to read/write 16 bytes after the end of a StringRef that points into a ColumnString.

memcpy_small_allow_read_write_overflow15(
column_string_chars.data() + old_size, str_ref.data + str_offset,
split_part_size);
}
(*dest_nested_null_map).push_back(false);
string_pos += split_part_size;
dest_pos++;
column_string_offsets.push_back(string_pos);
}
}
dest_offsets.push_back(dest_pos);
}
}

size_t split_str(size_t& pos, const StringRef str_ref, StringRef delimiter_ref) const {
size_t old_size = pos;
size_t str_size = str_ref.size;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,23 @@
9 a,b,c, , ["a", "b", "c", ""]
10 \N , \N

-- !sql_1 --
1 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"]
2 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"]

-- !sql_2 --
3 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"]
4 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"]

-- !sql_3 --
1 [] [] [] []
2 [] [] [] []
3 ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"]
4 ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"]

-- !sql_4 --
1 [] [] [] []
2 [] [] [] []
3 [""] [""] [""] [""]
4 [""] [""] [""] [""]

Original file line number Diff line number Diff line change
Expand Up @@ -102,4 +102,64 @@ suite("test_split_by_string") {


qt_sql "SELECT *, split_by_string(v1, v2) FROM ${tableName2} ORDER BY k1"

// The case where both operands are string columns is covered by the test above.
sql """DROP TABLE IF EXISTS test_split_by_string_2"""
sql """
CREATE TABLE IF NOT EXISTS test_split_by_string_2 (
`rid` INT NULL,
`str` TEXT NULL,
`vc` VARCHAR(5) NULL,
`chr` CHAR(5) NULL,
`txt` TEXT NULL
) ENGINE=OLAP
DUPLICATE KEY(`rid`)
DISTRIBUTED BY HASH(`rid`) BUCKETS 1
PROPERTIES (
"replication_allocation" = "tag.location.default: 1",
"storage_format" = "V2"
)
"""
sql """ INSERT INTO test_split_by_string_2
VALUES (1, "", "", "", ""),
(2, "", "", "", ""),
(3, "a,b,c", "a,b,c", "a,b,c", "a,b,c"),
(4, "a,b,c", "a,b,c", "a,b,c", "a,b,c")
"""
// Left operand is const, right operand is a string column
qt_sql_1 """
SELECT rid,
split_by_string("abc", str),
split_by_string("abc", vc),
split_by_string("abc", chr),
split_by_string("abc", txt)
FROM test_split_by_string_2 WHERE rid=1 OR rid=2 ORDER BY rid;
"""
// Left operand is a string column, right operand is const
qt_sql_2 """
SELECT rid,
split_by_string(str, ","),
split_by_string(vc, ","),
split_by_string(chr, ","),
split_by_string(txt, ",")
FROM test_split_by_string_2 WHERE rid=3 OR rid=4 ORDER BY rid;
"""

// Empty string
qt_sql_3 """
SELECT rid,
split_by_string(str, ""),
split_by_string(vc, ""),
split_by_string(chr, ""),
split_by_string(txt, "")
FROM test_split_by_string_2 ORDER BY rid;
"""
qt_sql_4 """
SELECT rid,
split_by_string("", str),
split_by_string("", vc),
split_by_string("", chr),
split_by_string("", txt)
FROM test_split_by_string_2 ORDER BY rid;
"""
}