From 1c90b20cb671b91508c57391934ff58a6277280f Mon Sep 17 00:00:00 2001 From: zhiqiang-hhhh Date: Tue, 5 Dec 2023 21:05:00 +0800 Subject: [PATCH 1/3] Fix --- be/src/vec/functions/function_string.h | 108 +++++++++++++++--- .../string_functions/test_split_by_string.out | 8 ++ .../test_split_by_string.groovy | 42 +++++++ 3 files changed, 139 insertions(+), 19 deletions(-) diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index b2508decf723bc..9a9af5a7f90ef0 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -1747,7 +1747,7 @@ class FunctionSplitByString : public IFunction { } Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments, - size_t result, size_t /*input_rows_count*/) const override { + size_t result, size_t input_rows_count) const override { DCHECK_EQ(arguments.size(), 2); const auto& [src_column, left_const] = @@ -1755,6 +1755,7 @@ class FunctionSplitByString : public IFunction { const auto& [right_column, right_const] = unpack_if_const(block.get_by_position(arguments[1]).column); + DataTypePtr right_column_type = block.get_by_position(arguments[1]).type; DataTypePtr src_column_type = block.get_by_position(arguments[0]).type; auto dest_column_ptr = ColumnArray::create(make_nullable(src_column_type)->create_column(), ColumnArray::ColumnOffsets::create()); @@ -1762,35 +1763,50 @@ class FunctionSplitByString : public IFunction { IColumn* dest_nested_column = &dest_column_ptr->get_data(); auto& dest_offsets = dest_column_ptr->get_offsets(); DCHECK(dest_nested_column != nullptr); - dest_nested_column->reserve(0); - dest_offsets.reserve(0); + dest_nested_column->reserve(input_rows_count); + dest_offsets.reserve(input_rows_count); NullMapType* dest_nested_null_map = nullptr; ColumnNullable* dest_nullable_col = reinterpret_cast(dest_nested_column); dest_nested_column = dest_nullable_col->get_nested_column_ptr(); dest_nested_null_map = &dest_nullable_col->get_null_map_column().get_data(); - if (auto col_left = check_and_get_column(src_column.get())) { - if (auto col_right = check_and_get_column(right_column.get())) { - if (right_const) { - _execute_constant(*col_left, col_right->get_data_at(0), *dest_nested_column, - dest_offsets, dest_nested_null_map); - } else { - _execute_vector(*col_left, *col_right, *dest_nested_column, dest_offsets, - dest_nested_null_map); - } + auto col_left = check_and_get_column(src_column.get()); + if (!col_left) { + return Status::InternalError("Left operator of function {} can not be {}", get_name(), + src_column_type->get_name()); + } - block.replace_by_position(result, std::move(dest_column_ptr)); - return Status::OK(); - } + auto col_right = check_and_get_column(right_column.get()); + if (!col_right) { + return Status::InternalError("Right operator of function {} can not be {}", get_name(), + right_column_type->get_name()); } - return Status::RuntimeError("unimplements function {}", get_name()); + + // split_by_string(ColumnString, "xxx") + if (right_const) { + _execute_constant_delimiter(*col_left, col_right->get_data_at(0), *dest_nested_column, + dest_offsets, dest_nested_null_map); + } else if (left_const) { + // split_by_string("xxx", ColumnString) + _execute_constant_src_string(col_left->get_data_at(0), *col_right, *dest_nested_column, + dest_offsets, dest_nested_null_map); + } else { + // split_by_string(ColumnString, ColumnString) + _execute_vector(*col_left, *col_right, *dest_nested_column, dest_offsets, + dest_nested_null_map); + } + + block.replace_by_position(result, std::move(dest_column_ptr)); + + return Status::OK(); } private: - void _execute_constant(const ColumnString& src_column_string, const StringRef& delimiter_ref, - IColumn& dest_nested_column, ColumnArray::Offsets64& dest_offsets, - NullMapType* dest_nested_null_map) const { + void _execute_constant_delimiter(const ColumnString& src_column_string, + const StringRef& delimiter_ref, IColumn& dest_nested_column, + ColumnArray::Offsets64& dest_offsets, + NullMapType* dest_nested_null_map) const { ColumnString& dest_column_string = reinterpret_cast(dest_nested_column); ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); @@ -1911,6 +1927,60 @@ class FunctionSplitByString : public IFunction { } } + void _execute_constant_src_string(const StringRef& str_ref, const ColumnString& delimiter_col, + IColumn& dest_nested_column, + ColumnArray::Offsets64& dest_offsets, + NullMapType* dest_nested_null_map) const { + ColumnString& dest_column_string = reinterpret_cast(dest_nested_column); + ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); + ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); + column_string_chars.reserve(0); + + ColumnArray::Offset64 string_pos = 0; + ColumnArray::Offset64 dest_pos = 0; + const ColumnArray::Offset64 delimiter_offsets_size = delimiter_col.get_offsets().size(); + + // TODO: if src_string_ref.size == 0; + + for (size_t i = 0; i < delimiter_offsets_size; i++) { + const StringRef delimiter_ref = delimiter_col.get_data_at(i); + + if (delimiter_ref.size == 0) { + for (size_t str_pos = 0; str_pos < str_ref.size;) { + const size_t str_offset = str_pos; + const size_t old_size = column_string_chars.size(); + str_pos++; + const size_t new_size = old_size + 1; + column_string_chars.resize(new_size); + memcpy(column_string_chars.data() + old_size, str_ref.data + str_offset, 1); + (*dest_nested_null_map).push_back(false); + string_pos++; + dest_pos++; + column_string_offsets.push_back(string_pos); + } + } else { + for (size_t str_pos = 0; str_pos <= str_ref.size;) { + const size_t str_offset = str_pos; + const size_t old_size = column_string_chars.size(); + const size_t split_part_size = split_str(str_pos, str_ref, delimiter_ref); + str_pos += delimiter_ref.size; + const size_t new_size = old_size + split_part_size; + column_string_chars.resize(new_size); + if (split_part_size > 0) { + memcpy_small_allow_read_write_overflow15( + column_string_chars.data() + old_size, str_ref.data + str_offset, + split_part_size); + } + (*dest_nested_null_map).push_back(false); + string_pos += split_part_size; + dest_pos++; + column_string_offsets.push_back(string_pos); + } + } + dest_offsets.push_back(dest_pos); + } + } + size_t split_str(size_t& pos, const StringRef str_ref, StringRef delimiter_ref) const { size_t old_size = pos; size_t str_size = str_ref.size; diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out index 00d9ad99781881..f5d3f74b96a253 100644 --- a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out @@ -87,3 +87,11 @@ 9 a,b,c, , ["a", "b", "c", ""] 10 \N , \N +-- !sql_1 -- +1 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] +2 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] + +-- !sql_2 -- +3 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] +4 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] + diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy index d3f0588518124c..69681efb3ee6d6 100644 --- a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy @@ -102,4 +102,46 @@ suite("test_split_by_string") { qt_sql "SELECT *, split_by_string(v1, v2) FROM ${tableName2} ORDER BY k1" + + // Case where both of operator are column string is covered by above test. + sql """DROP TABLE IF EXISTS test_split_by_string_2""" + sql """ + CREATE TABLE IF NOT EXISTS test_split_by_string_2 ( + `rid` INT NULL, + `str` TEXT NULL, + `vc` VARCHAR(5) NULL, + `chr` CHAR(5) NULL, + `txt` TEXT NULL + ) ENGINE=OLAP + DUPLICATE KEY(`rid`) + DISTRIBUTED BY HASH(`rid`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "storage_format" = "V2" + ) + """ + sql """ INSERT INTO test_split_by_string_2 + VALUES (1, "", "", "", ""), + (2, "", "", "", ""), + (3, "a,b,c", "a,b,c", "a,b,c", "a,b,c"), + (4, "a,b,c", "a,b,c", "a,b,c", "a,b,c") + """ + // Left operator is const, right operator is column string + qt_sql_1 """ + SELECT rid, + split_by_string("abc", str), + split_by_string("abc", vc), + split_by_string("abc", chr), + split_by_string("abc", txt) + FROM test_split_by_string_2 WHERE rid=1 OR rid=2 ORDER BY rid; + """ + // Left operator is column string, right operator is const + qt_sql_2 """ + SELECT rid, + split_by_string(str, ","), + split_by_string(vc, ","), + split_by_string(chr, ","), + split_by_string(txt, ",") + FROM test_split_by_string_2 WHERE rid=3 OR rid=4 ORDER BY rid; + """ } \ No newline at end of file From 6d05a90abb706f0c3c8bdc5159acc807bf15aa31 Mon Sep 17 00:00:00 2001 From: zhiqiang-hhhh Date: Tue, 5 Dec 2023 21:57:42 +0800 Subject: [PATCH 2/3] empty string --- be/src/vec/functions/function_string.h | 9 ++++----- .../string_functions/test_split_by_string.out | 12 +++++++++++ .../test_split_by_string.groovy | 20 ++++++++++++++++++- 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 9a9af5a7f90ef0..56a05d0f333f34 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -1747,7 +1747,7 @@ class FunctionSplitByString : public IFunction { } Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments, - size_t result, size_t input_rows_count) const override { + size_t result, size_t /*input_rows_count*/) const override { DCHECK_EQ(arguments.size(), 2); const auto& [src_column, left_const] = @@ -1763,8 +1763,8 @@ class FunctionSplitByString : public IFunction { IColumn* dest_nested_column = &dest_column_ptr->get_data(); auto& dest_offsets = dest_column_ptr->get_offsets(); DCHECK(dest_nested_column != nullptr); - dest_nested_column->reserve(input_rows_count); - dest_offsets.reserve(input_rows_count); + dest_nested_column->reserve(0); + dest_offsets.reserve(0); NullMapType* dest_nested_null_map = nullptr; ColumnNullable* dest_nullable_col = reinterpret_cast(dest_nested_column); @@ -1940,9 +1940,8 @@ class FunctionSplitByString : public IFunction { ColumnArray::Offset64 dest_pos = 0; const ColumnArray::Offset64 delimiter_offsets_size = delimiter_col.get_offsets().size(); - // TODO: if src_string_ref.size == 0; - for (size_t i = 0; i < delimiter_offsets_size; i++) { + for (size_t i = 0; i < delimiter_offsets_size; ++i) { const StringRef delimiter_ref = delimiter_col.get_data_at(i); if (delimiter_ref.size == 0) { diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out index f5d3f74b96a253..c46fa2bd27e2cc 100644 --- a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out @@ -95,3 +95,15 @@ 3 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] 4 ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] +-- !sql_3 -- +1 [] [] [] [] +2 [] [] [] [] +3 ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] +4 ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] ["a", ",", "b", ",", "c"] + +-- !sql_4 -- +1 [] [] [] [] +2 [] [] [] [] +3 [""] [""] [""] [""] +4 [""] [""] [""] [""] + diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy index 69681efb3ee6d6..2ec70e361242ce 100644 --- a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy @@ -143,5 +143,23 @@ suite("test_split_by_string") { split_by_string(chr, ","), split_by_string(txt, ",") FROM test_split_by_string_2 WHERE rid=3 OR rid=4 ORDER BY rid; - """ + """ + + // Empty string + qt_sql_3 """ + SELECT rid, + split_by_string(str, ""), + split_by_string(vc, ""), + split_by_string(chr, ""), + split_by_string(txt, "") + FROM test_split_by_string_2 ORDER BY rid; + """ + qt_sql_4 """ + SELECT rid, + split_by_string("", str), + split_by_string("", vc), + split_by_string("", chr), + split_by_string("", txt) + FROM test_split_by_string_2 ORDER BY rid; + """ } \ No newline at end of file From 6eba8ca60d7ae43857c9a666dfd4bcc53a188a74 Mon Sep 17 00:00:00 2001 From: zhiqiang-hhhh Date: Tue, 5 Dec 2023 22:00:49 +0800 Subject: [PATCH 3/3] format --- be/src/vec/functions/function_string.h | 1 - 1 file changed, 1 deletion(-) diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 56a05d0f333f34..4487d88a951b21 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -1940,7 +1940,6 @@ class FunctionSplitByString : public IFunction { ColumnArray::Offset64 dest_pos = 0; const ColumnArray::Offset64 delimiter_offsets_size = delimiter_col.get_offsets().size(); - for (size_t i = 0; i < delimiter_offsets_size; ++i) { const StringRef delimiter_ref = delimiter_col.get_data_at(i);