-
Notifications
You must be signed in to change notification settings - Fork 3.7k
[fix](split_by_string) Fix split by string core on column string #28030
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1755,6 +1755,7 @@ class FunctionSplitByString : public IFunction { | |
| const auto& [right_column, right_const] = | ||
| unpack_if_const(block.get_by_position(arguments[1]).column); | ||
|
|
||
| DataTypePtr right_column_type = block.get_by_position(arguments[1]).type; | ||
| DataTypePtr src_column_type = block.get_by_position(arguments[0]).type; | ||
| auto dest_column_ptr = ColumnArray::create(make_nullable(src_column_type)->create_column(), | ||
| ColumnArray::ColumnOffsets::create()); | ||
|
|
@@ -1770,27 +1771,42 @@ class FunctionSplitByString : public IFunction { | |
| dest_nested_column = dest_nullable_col->get_nested_column_ptr(); | ||
| dest_nested_null_map = &dest_nullable_col->get_null_map_column().get_data(); | ||
|
|
||
| if (auto col_left = check_and_get_column<ColumnString>(src_column.get())) { | ||
| if (auto col_right = check_and_get_column<ColumnString>(right_column.get())) { | ||
| if (right_const) { | ||
| _execute_constant(*col_left, col_right->get_data_at(0), *dest_nested_column, | ||
| dest_offsets, dest_nested_null_map); | ||
| } else { | ||
| _execute_vector(*col_left, *col_right, *dest_nested_column, dest_offsets, | ||
| dest_nested_null_map); | ||
| } | ||
| auto col_left = check_and_get_column<ColumnString>(src_column.get()); | ||
| if (!col_left) { | ||
| return Status::InternalError("Left operator of function {} can not be {}", get_name(), | ||
| src_column_type->get_name()); | ||
| } | ||
|
|
||
| block.replace_by_position(result, std::move(dest_column_ptr)); | ||
| return Status::OK(); | ||
| } | ||
| auto col_right = check_and_get_column<ColumnString>(right_column.get()); | ||
| if (!col_right) { | ||
| return Status::InternalError("Right operator of function {} can not be {}", get_name(), | ||
| right_column_type->get_name()); | ||
| } | ||
|
|
||
| // split_by_string(ColumnString, "xxx") | ||
| if (right_const) { | ||
| _execute_constant_delimiter(*col_left, col_right->get_data_at(0), *dest_nested_column, | ||
| dest_offsets, dest_nested_null_map); | ||
| } else if (left_const) { | ||
| // split_by_string("xxx", ColumnString) | ||
| _execute_constant_src_string(col_left->get_data_at(0), *col_right, *dest_nested_column, | ||
| dest_offsets, dest_nested_null_map); | ||
| } else { | ||
| // split_by_string(ColumnString, ColumnString) | ||
| _execute_vector(*col_left, *col_right, *dest_nested_column, dest_offsets, | ||
| dest_nested_null_map); | ||
| } | ||
| return Status::RuntimeError("unimplements function {}", get_name()); | ||
|
|
||
| block.replace_by_position(result, std::move(dest_column_ptr)); | ||
|
|
||
| return Status::OK(); | ||
| } | ||
|
|
||
| private: | ||
| void _execute_constant(const ColumnString& src_column_string, const StringRef& delimiter_ref, | ||
| IColumn& dest_nested_column, ColumnArray::Offsets64& dest_offsets, | ||
| NullMapType* dest_nested_null_map) const { | ||
| void _execute_constant_delimiter(const ColumnString& src_column_string, | ||
| const StringRef& delimiter_ref, IColumn& dest_nested_column, | ||
| ColumnArray::Offsets64& dest_offsets, | ||
| NullMapType* dest_nested_null_map) const { | ||
| ColumnString& dest_column_string = reinterpret_cast<ColumnString&>(dest_nested_column); | ||
| ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); | ||
| ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); | ||
|
|
@@ -1911,6 +1927,58 @@ class FunctionSplitByString : public IFunction { | |
| } | ||
| } | ||
|
|
||
| void _execute_constant_src_string(const StringRef& str_ref, const ColumnString& delimiter_col, | ||
| IColumn& dest_nested_column, | ||
| ColumnArray::Offsets64& dest_offsets, | ||
| NullMapType* dest_nested_null_map) const { | ||
| ColumnString& dest_column_string = reinterpret_cast<ColumnString&>(dest_nested_column); | ||
zhiqiang-hhhh marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); | ||
| ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); | ||
| column_string_chars.reserve(0); | ||
|
|
||
| ColumnArray::Offset64 string_pos = 0; | ||
| ColumnArray::Offset64 dest_pos = 0; | ||
| const ColumnArray::Offset64 delimiter_offsets_size = delimiter_col.get_offsets().size(); | ||
|
|
||
| for (size_t i = 0; i < delimiter_offsets_size; ++i) { | ||
| const StringRef delimiter_ref = delimiter_col.get_data_at(i); | ||
|
|
||
| if (delimiter_ref.size == 0) { | ||
| for (size_t str_pos = 0; str_pos < str_ref.size;) { | ||
| const size_t str_offset = str_pos; | ||
| const size_t old_size = column_string_chars.size(); | ||
| str_pos++; | ||
| const size_t new_size = old_size + 1; | ||
| column_string_chars.resize(new_size); | ||
| memcpy(column_string_chars.data() + old_size, str_ref.data + str_offset, 1); | ||
| (*dest_nested_null_map).push_back(false); | ||
| string_pos++; | ||
| dest_pos++; | ||
| column_string_offsets.push_back(string_pos); | ||
| } | ||
| } else { | ||
| for (size_t str_pos = 0; str_pos <= str_ref.size;) { | ||
| const size_t str_offset = str_pos; | ||
| const size_t old_size = column_string_chars.size(); | ||
| const size_t split_part_size = split_str(str_pos, str_ref, delimiter_ref); | ||
| str_pos += delimiter_ref.size; | ||
| const size_t new_size = old_size + split_part_size; | ||
| column_string_chars.resize(new_size); | ||
| if (split_part_size > 0) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What if str_ref.length > 16?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't know. The snippet is copied from #18496. It seems that there is no guarantee of memory alignment. I will figure it out.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we are safe here, since ColumnString uses PaddedPODArray as its data type. |
||
| memcpy_small_allow_read_write_overflow15( | ||
| column_string_chars.data() + old_size, str_ref.data + str_offset, | ||
| split_part_size); | ||
| } | ||
| (*dest_nested_null_map).push_back(false); | ||
| string_pos += split_part_size; | ||
| dest_pos++; | ||
| column_string_offsets.push_back(string_pos); | ||
| } | ||
| } | ||
| dest_offsets.push_back(dest_pos); | ||
| } | ||
| } | ||
|
|
||
| size_t split_str(size_t& pos, const StringRef str_ref, StringRef delimiter_ref) const { | ||
| size_t old_size = pos; | ||
| size_t str_size = str_ref.size; | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.