From c48f7be5e5a7d90a96bb2a68fabf777d513d38b9 Mon Sep 17 00:00:00 2001 From: jacktengg <18241664+jacktengg@users.noreply.github.com> Date: Wed, 1 Mar 2023 15:30:55 +0800 Subject: [PATCH 1/2] [opt](string) optimize string equal comparision --- be/src/vec/functions/functions_comparison.h | 187 +++++++++++++++ .../datatype_p0/string/test_string_basic.out | 90 ++++++++ .../string/test_string_basic.groovy | 217 ++++++++++++++++++ 3 files changed, 494 insertions(+) diff --git a/be/src/vec/functions/functions_comparison.h b/be/src/vec/functions/functions_comparison.h index db616348649166..1c1dfed380d2cc 100644 --- a/be/src/vec/functions/functions_comparison.h +++ b/be/src/vec/functions/functions_comparison.h @@ -126,6 +126,118 @@ struct GenericComparisonImpl { } }; +template +struct StringComparisonImpl { + static void NO_INLINE string_vector_string_vector(const ColumnString::Chars& a_data, + const ColumnString::Offsets& a_offsets, + const ColumnString::Chars& b_data, + const ColumnString::Offsets& b_offsets, + PaddedPODArray& c) { + size_t size = a_offsets.size(); + ColumnString::Offset prev_a_offset = 0; + ColumnString::Offset prev_b_offset = 0; + const auto* a_pos = a_data.data(); + const auto* b_pos = b_data.data(); + + for (size_t i = 0; i < size; ++i) { + c[i] = Op::apply(memcmp_small_allow_overflow15( + a_pos + prev_a_offset, a_offsets[i] - prev_a_offset, + b_pos + prev_b_offset, b_offsets[i] - prev_b_offset), + 0); + + prev_a_offset = a_offsets[i]; + prev_b_offset = b_offsets[i]; + } + } + + static void NO_INLINE string_vector_constant(const ColumnString::Chars& a_data, + const ColumnString::Offsets& a_offsets, + const ColumnString::Chars& b_data, + ColumnString::Offset b_size, + PaddedPODArray& c) { + size_t size = a_offsets.size(); + ColumnString::Offset prev_a_offset = 0; + const auto* a_pos = a_data.data(); + const auto* b_pos = b_data.data(); + + for (size_t i = 0; i < size; ++i) { + c[i] = Op::apply( + memcmp_small_allow_overflow15(a_pos + prev_a_offset, + a_offsets[i] - prev_a_offset, b_pos, b_size), + 0); + + prev_a_offset = a_offsets[i]; + } + } + + static void constant_string_vector(const ColumnString::Chars& a_data, + ColumnString::Offset a_size, + const ColumnString::Chars& b_data, + const ColumnString::Offsets& b_offsets, + PaddedPODArray& c) { + StringComparisonImpl::string_vector_constant(b_data, b_offsets, + a_data, a_size, c); + } +}; + +template +struct StringEqualsImpl { + static void NO_INLINE string_vector_string_vector(const ColumnString::Chars& a_data, + const ColumnString::Offsets& a_offsets, + const ColumnString::Chars& b_data, + const ColumnString::Offsets& b_offsets, + PaddedPODArray& c) { + size_t size = a_offsets.size(); + ColumnString::Offset prev_a_offset = 0; + ColumnString::Offset prev_b_offset = 0; + const auto* a_pos = a_data.data(); + const auto* b_pos = b_data.data(); + + for (size_t i = 0; i < size; ++i) { + auto a_size = a_offsets[i] - prev_a_offset; + auto b_size = b_offsets[i] - prev_b_offset; + + c[i] = positive == memequal_small_allow_overflow15(a_pos + prev_a_offset, a_size, + b_pos + prev_b_offset, b_size); + + prev_a_offset = a_offsets[i]; + prev_b_offset = b_offsets[i]; + } + } + + static void NO_INLINE string_vector_constant(const ColumnString::Chars& a_data, + const ColumnString::Offsets& a_offsets, + const ColumnString::Chars& b_data, + ColumnString::Offset b_size, + PaddedPODArray& c) { + size_t size = a_offsets.size(); + ColumnString::Offset prev_a_offset = 0; + + for (size_t i = 0; i < size; ++i) { + auto a_size = a_offsets[i] - prev_a_offset; + + c[i] = positive == memequal_small_allow_overflow15(a_data.data() + prev_a_offset, + a_size, b_data.data(), b_size); + + prev_a_offset = a_offsets[i]; + } + } + + static void NO_INLINE constant_string_vector(const ColumnString::Chars& a_data, + ColumnString::Offset a_size, + const ColumnString::Chars& b_data, + const ColumnString::Offsets& b_offsets, + PaddedPODArray& c) { + string_vector_constant(b_data, b_offsets, a_data, a_size, c); + } +}; + +template +struct StringComparisonImpl> : StringEqualsImpl {}; + +template +struct StringComparisonImpl> : StringEqualsImpl {}; + struct NameEquals { static constexpr auto name = "eq"; }; @@ -291,6 +403,75 @@ class FunctionComparison : public IFunction { return Status::OK(); } + Status execute_string(Block& block, size_t result, const IColumn* c0, const IColumn* c1) { + const ColumnString* c0_string = check_and_get_column(c0); + const ColumnString* c1_string = check_and_get_column(c1); + const ColumnConst* c0_const = check_and_get_column_const_string_or_fixedstring(c0); + const ColumnConst* c1_const = check_and_get_column_const_string_or_fixedstring(c1); + if (!((c0_string || c0_const) && (c1_string || c1_const))) { + return Status::NotSupported("Illegal columns {}, {} of argument of function {}", + c0->get_name(), c1->get_name(), name); + } + + if (c0_const && c1_const) { + execute_generic_identical_types(block, result, c0, c1); + return Status::OK(); + } + + const ColumnString::Chars* c0_const_chars = nullptr; + const ColumnString::Chars* c1_const_chars = nullptr; + ColumnString::Offset c0_const_size = 0; + ColumnString::Offset c1_const_size = 0; + + if (c0_const) { + const ColumnString* c0_const_string = + check_and_get_column(&c0_const->get_data_column()); + + if (c0_const_string) { + c0_const_chars = &c0_const_string->get_chars(); + c0_const_size = c0_const_string->get_data_at(0).size; + } else + return Status::NotSupported("Illegal columns {}, of argument of function {}", + c0->get_name(), name); + } + + if (c1_const) { + const ColumnString* c1_const_string = + check_and_get_column(&c1_const->get_data_column()); + + if (c1_const_string) { + c1_const_chars = &c1_const_string->get_chars(); + c1_const_size = c1_const_string->get_data_at(0).size; + } else + return Status::NotSupported("Illegal columns {}, of argument of function {}", + c1->get_name(), name); + } + + using StringImpl = StringComparisonImpl>; + + auto c_res = ColumnUInt8::create(); + ColumnUInt8::Container& vec_res = c_res->get_data(); + vec_res.resize(c0->size()); + + if (c0_string && c1_string) { + StringImpl::string_vector_string_vector( + c0_string->get_chars(), c0_string->get_offsets(), c1_string->get_chars(), + c1_string->get_offsets(), vec_res); + } else if (c0_string && c1_const) { + StringImpl::string_vector_constant(c0_string->get_chars(), c0_string->get_offsets(), + *c1_const_chars, c1_const_size, vec_res); + } else if (c0_const && c1_string) { + StringImpl::constant_string_vector(*c0_const_chars, c0_const_size, + c1_string->get_chars(), c1_string->get_offsets(), + vec_res); + } else { + return Status::NotSupported("Illegal columns {}, {} of argument of function {}", + c0->get_name(), c1->get_name(), name); + } + block.replace_by_position(result, std::move(c_res)); + return Status::OK(); + } + void execute_generic_identical_types(Block& block, size_t result, const IColumn* c0, const IColumn* c1) { bool c0_const = is_column_const(*c0); @@ -373,6 +554,9 @@ class FunctionComparison : public IFunction { const bool left_is_num = col_left_untyped->is_numeric(); const bool right_is_num = col_right_untyped->is_numeric(); + const bool left_is_string = which_left.is_string_or_fixed_string(); + const bool right_is_string = which_right.is_string_or_fixed_string(); + // Compare date and datetime direct use the Int64 compare. Keep the comment // may we should refactor the code. // bool date_and_datetime = (left_type != right_type) && which_left.is_date_or_datetime() && @@ -417,6 +601,9 @@ class FunctionComparison : public IFunction { } return execute_decimal(block, result, col_with_type_and_name_left, col_with_type_and_name_right); + } else if (left_is_string && right_is_string) { + return execute_string(block, result, col_with_type_and_name_left.column.get(), + col_with_type_and_name_right.column.get()); } else { // TODO: varchar and string maybe need a quickly way return execute_generic(block, result, col_with_type_and_name_left, diff --git a/regression-test/data/datatype_p0/string/test_string_basic.out b/regression-test/data/datatype_p0/string/test_string_basic.out index 475cf22f27db8e..fc1268643c1592 100644 --- a/regression-test/data/datatype_p0/string/test_string_basic.out +++ b/regression-test/data/datatype_p0/string/test_string_basic.out @@ -5,3 +5,93 @@ 1 1f44fb91f47cab16f711973af06294a0 65536 2 3c514d3b89e26e2f983b7bd4cbb82055 1048576 +-- !col_eq_col -- +\N 0 0 +8001 1 0 +8008 8 0 +8008 8008 1 + +-- !col_neq_col -- +\N 0 0 +8001 1 1 +8008 8 1 +8008 8008 0 + +-- !col_gt_col -- +\N 0 0 +8001 1 1 +8008 8 1 +8008 8008 0 + +-- !col_eq_const -- +\N 0 0 +8001 1 0 +8008 8 1 +8008 8008 1 + +-- !col_neq_const -- +\N 0 0 +8001 1 1 +8008 8 0 +8008 8008 0 + +-- !col_lt_const -- +\N 0 0 +8001 1 1 +8008 8 0 +8008 8008 0 + +-- !const_eq_col -- +\N 0 0 +8001 1 0 +8008 8 1 +8008 8008 1 + +-- !const_neq_col -- +\N 0 0 +8001 1 1 +8008 8 0 +8008 8008 0 + +-- !const_gt_col -- +\N 0 0 +8001 1 1 +8008 8 0 +8008 8008 0 + +-- !const_eq_const -- +\N 0 1 +8001 1 1 +8008 8 1 +8008 8008 1 + +-- !const_neq_const -- +\N 0 0 +8001 1 0 +8008 8 0 +8008 8008 0 + +-- !const_gt_const -- +\N 0 1 +8001 1 1 +8008 8 1 +8008 8008 1 + +-- !col_eq_null -- +\N 0 0 +8001 1 0 +8008 8 0 +8008 8008 0 + +-- !col_neq_null -- +\N 0 0 +8001 1 0 +8008 8 0 +8008 8008 0 + +-- !col_gt_null -- +\N 0 0 +8001 1 0 +8008 8 0 +8008 8008 0 + diff --git a/regression-test/suites/datatype_p0/string/test_string_basic.groovy b/regression-test/suites/datatype_p0/string/test_string_basic.groovy index 17533d1631ad33..a24f0916577c7f 100644 --- a/regression-test/suites/datatype_p0/string/test_string_basic.groovy +++ b/regression-test/suites/datatype_p0/string/test_string_basic.groovy @@ -47,5 +47,222 @@ suite("test_string_basic") { (2, repeat("test1111", 131072)) """ order_qt_select_str_tb "select k1, md5(v1), length(v1) from ${tbName}" + + sql """drop table if exists test_string_cmp;""" + + sql """ + CREATE TABLE `test_string_cmp` ( + `ts` datetime NULL, + `s1` varchar(32) NULL, + `s2` varchar(128) NULL + ) ENGINE = OLAP + DUPLICATE KEY(`ts`) + DISTRIBUTED BY HASH(`s1`) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ); + """ + + sql """ + INSERT INTO `test_string_cmp` VALUES ('2023-02-22 12:00:00', '8001', '1'), + ('2023-02-22 12:00:03', '8008', '8'), + ('2023-02-22 12:00:03', '8008', '8008'), + ('2023-02-22 12:00:03', null, '0'); + """ + + qt_col_eq_col """ + select + s1, s2, + if( + s1 = s2, + 1, + 0 + ) as counts + from + test_string_cmp + order by s1, s2, counts; + """ + + qt_col_neq_col """ + select + s1, s2, + if( + s1 != s2, + 1, + 0 + ) as counts + from + test_string_cmp + order by s1, s2, counts; + """ + + qt_col_gt_col """ + select + s1, s2, + if( + s1 > s2, + 1, + 0 + ) as counts + from + test_string_cmp + order by s1, s2, counts; + """ + + qt_col_eq_const """ + select + s1, s2, + if( + s1 = '8008', + 1, + 0 + ) as counts + from + test_string_cmp + order by s1, s2, counts; + """ + + qt_col_neq_const """ + select + s1, s2, + if( + s1 != '8008', + 1, + 0 + ) as counts + from + test_string_cmp + order by s1, s2, counts; + """ + + qt_col_lt_const """ + select + s1, s2, + if( + s1 < '8008', + 1, + 0 + ) as counts + from + test_string_cmp + order by s1, s2, counts; + """ + + qt_const_eq_col """ + select + s1, s2, + if( + '8008' = s1, + 1, + 0 + ) as counts + from + test_string_cmp + order by s1, s2, counts; + """ + + qt_const_neq_col """ + select + s1, s2, + if( + '8008' != s1, + 1, + 0 + ) as counts + from + test_string_cmp + order by s1, s2, counts; + """ + + qt_const_gt_col """ + select + s1, s2, + if( + '8008' > s1, + 1, + 0 + ) as counts + from + test_string_cmp + order by s1, s2, counts; + """ + + qt_const_eq_const """ + select + s1, s2, + if( + '8008' = substr('a8008', 2, 4), + 1, + 0 + ) as counts + from + test_string_cmp + order by s1, s2, counts; + """ + + qt_const_neq_const """ + select + s1, s2, + if( + '8008' != substr('a8008', 2, 4), + 1, + 0 + ) as counts + from + test_string_cmp + order by s1, s2, counts; + """ + + qt_const_gt_const """ + select + s1, s2, + if( + '8008' > substr('a8007', 2, 4), + 1, + 0 + ) as counts + from + test_string_cmp + order by s1, s2, counts; + """ + + qt_col_eq_null """ + select + s1, s2, + if( + s1 = null, + 1, + 0 + ) as counts + from + test_string_cmp + order by s1, s2, counts; + """ + + qt_col_neq_null """ + select + s1, s2, + if( + s1 != null, + 1, + 0 + ) as counts + from + test_string_cmp + order by s1, s2, counts; + """ + + qt_col_gt_null """ + select + s1, s2, + if( + s1 > null, + 1, + 0 + ) as counts + from + test_string_cmp + order by s1, s2, counts; + """ } From 55eef0be269eeac8c690e371eb832cb96be74fc4 Mon Sep 17 00:00:00 2001 From: jacktengg <18241664+jacktengg@users.noreply.github.com> Date: Thu, 2 Mar 2023 16:14:23 +0800 Subject: [PATCH 2/2] fix code format --- be/src/vec/functions/functions_comparison.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/be/src/vec/functions/functions_comparison.h b/be/src/vec/functions/functions_comparison.h index 1c1dfed380d2cc..714467c5766152 100644 --- a/be/src/vec/functions/functions_comparison.h +++ b/be/src/vec/functions/functions_comparison.h @@ -430,9 +430,10 @@ class FunctionComparison : public IFunction { if (c0_const_string) { c0_const_chars = &c0_const_string->get_chars(); c0_const_size = c0_const_string->get_data_at(0).size; - } else + } else { return Status::NotSupported("Illegal columns {}, of argument of function {}", c0->get_name(), name); + } } if (c1_const) { @@ -442,9 +443,10 @@ class FunctionComparison : public IFunction { if (c1_const_string) { c1_const_chars = &c1_const_string->get_chars(); c1_const_size = c1_const_string->get_data_at(0).size; - } else + } else { return Status::NotSupported("Illegal columns {}, of argument of function {}", c1->get_name(), name); + } } using StringImpl = StringComparisonImpl>;