From 6413b987299b62e8207c435e2e4877b9bb7f4537 Mon Sep 17 00:00:00 2001 From: yangzhg Date: Mon, 7 Feb 2022 17:21:30 +0800 Subject: [PATCH] [fix] fix ltrim result may incorrect in some case --- be/CMakeLists.txt | 3 +- be/src/exprs/v_string_functions.h | 219 ----------------------- be/src/vec/functions/function_string.cpp | 17 +- be/test/exprs/string_functions_test.cpp | 56 +++--- 4 files changed, 39 insertions(+), 256 deletions(-) delete mode 100644 be/src/exprs/v_string_functions.h diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index 6bf98132f1d4a6..2d50c674a1d7a7 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -674,7 +674,6 @@ else() ${DORIS_DEPENDENCIES} ${WL_START_GROUP} ${X86_DEPENDENCIES} - ${WL_END_GROUP} ) endif() @@ -690,6 +689,8 @@ if (WITH_MYSQL) ) endif() +set(DORIS_DEPENDENCIES ${DORIS_DEPENDENCIES} ${WL_END_GROUP}) + message(STATUS "DORIS_DEPENDENCIES is ${DORIS_DEPENDENCIES}") # Add all external dependencies. They should come after the palo libs. diff --git a/be/src/exprs/v_string_functions.h b/be/src/exprs/v_string_functions.h deleted file mode 100644 index 3fd9845535eda1..00000000000000 --- a/be/src/exprs/v_string_functions.h +++ /dev/null @@ -1,219 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#ifndef BE_V_STRING_FUNCTIONS_H -#define BE_V_STRING_FUNCTIONS_H - -#include -#include -#include "runtime/string_value.hpp" - -#ifdef __SSE2__ -#include -#endif - -namespace doris { -class VStringFunctions { -public: -#ifdef __SSE2__ - /// n equals to 16 chars length - static constexpr auto REGISTER_SIZE = sizeof(__m128i); -#endif -public: - static StringVal rtrim(const StringVal& str) { - if (str.is_null || str.len == 0) { - return str; - } - auto begin = 0; - auto end = str.len - 1; -#ifdef __SSE2__ - char blank = ' '; - const auto pattern = _mm_set1_epi8(blank); - while (end - begin + 1 >= REGISTER_SIZE) { - const auto v_haystack = _mm_loadu_si128(reinterpret_cast(str.ptr + end + 1 - REGISTER_SIZE)); - const auto v_against_pattern = _mm_cmpeq_epi8(v_haystack, pattern); - const auto mask = _mm_movemask_epi8(v_against_pattern); - int offset = __builtin_clz(~(mask << REGISTER_SIZE)); - /// means not found - if (offset == 0) - { - return StringVal(str.ptr + begin, end - begin + 1); - } else { - end -= offset; - } - } -#endif - while (end >= begin && str.ptr[end] == ' ') { - --end; - } - if (end < 0) { - return StringVal(""); - } - return StringVal(str.ptr + begin, end - begin + 1); - } - - static StringVal ltrim(const StringVal& str) { - if (str.is_null || str.len == 0) { - return str; - } - auto begin = 0; - auto end = str.len - 1; -#ifdef __SSE2__ - char blank = ' '; - const auto pattern = _mm_set1_epi8(blank); - while (end - begin + 1 >= REGISTER_SIZE) { - const auto v_haystack = _mm_loadu_si128(reinterpret_cast(str.ptr + begin)); - const auto v_against_pattern = _mm_cmpeq_epi8(v_haystack, pattern); - const auto mask = _mm_movemask_epi8(v_against_pattern); - const auto offset = __builtin_ctz(mask ^ 0xffff); - /// means not found - if (offset == 0) - { - return StringVal(str.ptr + begin, end - begin + 1); - } else if (offset > REGISTER_SIZE) { - begin += REGISTER_SIZE; - } else { - begin += offset; - return StringVal(str.ptr + begin, end - begin + 1); - } - } -#endif - while (begin <= end && str.ptr[begin] == ' ') { - ++begin; - } - return StringVal(str.ptr + begin, end - begin + 1); - } - - static StringVal trim(const StringVal& str) { - if (str.is_null || str.len == 0) { - return str; - } - return rtrim(ltrim(str)); - } - - static bool is_ascii(StringVal str) { - #ifdef __SSE2__ - size_t i = 0; - __m128i binary_code = _mm_setzero_si128(); - if (str.len >= REGISTER_SIZE) { - for (; i <= str.len - REGISTER_SIZE; i += REGISTER_SIZE) { - __m128i chars = _mm_loadu_si128((const __m128i*)(str.ptr + i)); - binary_code = _mm_or_si128(binary_code, chars); - } - } - int mask = _mm_movemask_epi8(binary_code); - - char or_code = 0; - for (; i < str.len; i++) { - or_code |= str.ptr[i]; - } - mask |= (or_code & 0x80); - - return !mask; - #else - char or_code = 0; - for (size_t i = 0; i < str.len; i++) { - or_code |= str.ptr[i]; - } - return !(or_code & 0x80); - #endif - } - - static void reverse(const StringVal& str, StringVal dst) { - if (str.is_null) { - dst.ptr = NULL; - return; - } - const bool is_ascii = VStringFunctions::is_ascii(str); - if (is_ascii) { - int64_t begin = 0; - int64_t end = str.len; - int64_t result_end = dst.len; - #if defined(__SSE2__) - const auto shuffle_array = _mm_set_epi64((__m64)0x00'01'02'03'04'05'06'07ull, (__m64)0x08'09'0a'0b'0c'0d'0e'0full); - for (; (begin + REGISTER_SIZE) < end; begin += REGISTER_SIZE) { - result_end -= REGISTER_SIZE; - _mm_storeu_si128((__m128i*)(dst.ptr + result_end), - _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(str.ptr + begin)), shuffle_array)); - } - #endif - for (; begin < end; ++begin) { - --result_end; - dst.ptr[result_end] = str.ptr[begin]; - } - } else { - for (size_t i = 0, char_size = 0; i < str.len; i += char_size) { - char_size = get_utf8_byte_length((unsigned)(str.ptr)[i]); - std::copy(str.ptr + i, str.ptr + i + char_size, dst.ptr + str.len - i - char_size); - } - } - } - - static size_t get_utf8_byte_length(unsigned char byte) { - size_t char_size = 0; - if (byte >= 0xFC) { - char_size = 6; - } else if (byte >= 0xF8) { - char_size = 5; - } else if (byte >= 0xF0) { - char_size = 4; - } else if (byte >= 0xE0) { - char_size = 3; - } else if (byte >= 0xC0) { - char_size = 2; - } else { - char_size = 1; - } - return char_size; - } - - static void hex_encode(const unsigned char* src_str, size_t length, char* dst_str) { - static constexpr auto hex_table = "0123456789ABCDEF"; - auto src_str_end = src_str + length; - -#if defined(__SSE2__) - constexpr auto step = sizeof(uint64); - if (src_str + step < src_str_end) { - const auto hex_map = _mm_loadu_si128(reinterpret_cast(hex_table)); - const auto mask_map = _mm_set1_epi8(0x0F); - - do { - auto data = _mm_loadu_si64(src_str); - auto hex_loc = _mm_and_si128(_mm_unpacklo_epi8(_mm_srli_epi64(data, 4), data), mask_map); - _mm_storeu_si128(reinterpret_cast<__m128i *>(dst_str), _mm_shuffle_epi8(hex_map, hex_loc)); - - src_str += step; - dst_str += step * 2; - } while (src_str + step < src_str_end); - } -#endif - char res[2]; - // hex(str) str length is n, result must be 2 * n length - for (; src_str < src_str_end; src_str += 1, dst_str += 2) { - // low 4 bits - *(res + 1) = hex_table[src_str[0] & 0x0F]; - // high 4 bits - *res = hex_table[(src_str[0] >> 4)]; - std::copy(res, res + 2, dst_str); - } - } -}; -} - -#endif //BE_V_STRING_FUNCTIONS_H \ No newline at end of file diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp index bdeb7e57b6b536..a89edf5d73f16e 100644 --- a/be/src/vec/functions/function_string.cpp +++ b/be/src/vec/functions/function_string.cpp @@ -23,9 +23,9 @@ #include #include -#include "exprs/v_string_functions.h" #include "runtime/string_search.hpp" #include "util/encryption_util.h" +#include "util/simd/vstring_function.h" #include "util/url_coding.h" #include "vec/common/pod_array_fwd.h" #include "vec/functions/function_string_to_string.h" @@ -258,8 +258,8 @@ struct ReverseImpl { auto src_str = reinterpret_cast(&data[offsets[i - 1]]); int64_t src_len = offsets[i] - offsets[i - 1] - 1; char dst[src_len]; - VStringFunctions::reverse(StringVal((uint8_t*)src_str, src_len), - StringVal((uint8_t*)dst, src_len)); + simd::VStringFunctions::reverse(StringVal((uint8_t*)src_str, src_len), + StringVal((uint8_t*)dst, src_len)); StringOP::push_value_string(std::string_view(dst, src_len), i, res_data, res_offsets); } return Status::OK(); @@ -271,9 +271,7 @@ struct HexStringName { }; struct HexStringImpl { - static DataTypes get_variadic_argument_types() { - return {std::make_shared()}; - } + static DataTypes get_variadic_argument_types() { return {std::make_shared()}; } static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, ColumnString::Chars& dst_data, ColumnString::Offsets& dst_offsets) { @@ -293,7 +291,8 @@ struct HexStringImpl { dst_data_ptr++; offset++; } else { - VStringFunctions::hex_encode(source, srclen, reinterpret_cast(dst_data_ptr)); + simd::VStringFunctions::hex_encode(source, srclen, + reinterpret_cast(dst_data_ptr)); dst_data_ptr[srclen * 2] = '\0'; dst_data_ptr += (srclen * 2 + 1); offset += (srclen * 2 + 1); @@ -355,10 +354,10 @@ struct TrimImpl { const char* raw_str = reinterpret_cast(&data[offsets[i - 1]]); StringVal str(raw_str); if constexpr (is_ltrim) { - str = VStringFunctions::ltrim(str); + str = simd::VStringFunctions::ltrim(str); } if constexpr (is_rtrim) { - str = VStringFunctions::rtrim(str); + str = simd::VStringFunctions::rtrim(str); } StringOP::push_value_string(std::string_view((char*)str.ptr, str.len), i, res_data, res_offsets); diff --git a/be/test/exprs/string_functions_test.cpp b/be/test/exprs/string_functions_test.cpp index 035e6a1a34cc9c..acae0e871e5bc2 100644 --- a/be/test/exprs/string_functions_test.cpp +++ b/be/test/exprs/string_functions_test.cpp @@ -16,17 +16,18 @@ // under the License. #include "exprs/string_functions.h" -#include "exprs/v_string_functions.h" +#include #include + #include #include -#include #include "exprs/anyval_util.h" #include "test_util/test_util.h" #include "testutil/function_utils.h" #include "util/logging.h" +#include "util/simd/vstring_function.h" namespace doris { @@ -47,8 +48,7 @@ class StringFunctionsTest : public testing::Test { TEST_F(StringFunctionsTest, do_money_format_for_bigint_bench) { doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - StringVal expected = - AnyValUtil::from_string(ctx, std::string("9,223,372,036,854,775,807.00")); + StringVal expected = AnyValUtil::from_string(ctx, std::string("9,223,372,036,854,775,807.00")); BigIntVal bigIntVal(9223372036854775807); for (int i = 0; i < LOOP_LESS_OR_MORE(10, 10000000); i++) { StringVal result = StringFunctions::money_format(context, bigIntVal); @@ -98,7 +98,7 @@ TEST_F(StringFunctionsTest, money_format_large_int) { value = MIN_INT128; result = StringFunctions::money_format(context, doris_udf::LargeIntVal(value)); expected = AnyValUtil::from_string_temp( - context, std::string("-170,141,183,460,469,231,731,687,303,715,884,105,728.00")); + context, std::string("-170,141,183,460,469,231,731,687,303,715,884,105,728.00")); ASSERT_EQ(expected, result); delete context; } @@ -682,80 +682,82 @@ TEST_F(StringFunctionsTest, upper) { TEST_F(StringFunctionsTest, ltrim) { // no blank StringVal src("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); - StringVal res = VStringFunctions::ltrim(src); + StringVal res = simd::VStringFunctions::ltrim(src); ASSERT_EQ(src, res); // empty string StringVal src1(""); - res = VStringFunctions::ltrim(src1); + res = simd::VStringFunctions::ltrim(src1); ASSERT_EQ(src1, res); // null string StringVal src2(StringVal::null()); - res = VStringFunctions::ltrim(src2); + res = simd::VStringFunctions::ltrim(src2); ASSERT_EQ(src2, res); // less than 16 blanks StringVal src3(" hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); - res = VStringFunctions::ltrim(src3); + res = simd::VStringFunctions::ltrim(src3); ASSERT_EQ(src, res); // more than 16 blanks StringVal src4(" hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); - res = VStringFunctions::ltrim(src4); + res = simd::VStringFunctions::ltrim(src4); ASSERT_EQ(src, res); // all are blanks, less than 16 blanks StringVal src5(" "); - res = VStringFunctions::ltrim(src5); + res = simd::VStringFunctions::ltrim(src5); ASSERT_EQ(StringVal(""), res); // all are blanks, more than 16 blanks StringVal src6(" "); - res = VStringFunctions::ltrim(src6); + res = simd::VStringFunctions::ltrim(src6); ASSERT_EQ(StringVal(""), res); // src less than 16 length StringVal src7(" 12345678910"); - res = VStringFunctions::ltrim(src7); + res = simd::VStringFunctions::ltrim(src7); ASSERT_EQ(StringVal("12345678910"), res); } TEST_F(StringFunctionsTest, rtrim) { // no blank StringVal src("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); - StringVal res = VStringFunctions::rtrim(src); + StringVal res = simd::VStringFunctions::rtrim(src); ASSERT_EQ(src, res); // empty string StringVal src1(""); - res = VStringFunctions::rtrim(src1); + res = simd::VStringFunctions::rtrim(src1); ASSERT_EQ(src1, res); // null string StringVal src2(StringVal::null()); - res = VStringFunctions::rtrim(src2); + res = simd::VStringFunctions::rtrim(src2); ASSERT_EQ(src2, res); // less than 16 blanks StringVal src3("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa "); - res = VStringFunctions::rtrim(src3); + res = simd::VStringFunctions::rtrim(src3); ASSERT_EQ(src, res); // more than 16 blanks StringVal src4("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa "); - res = VStringFunctions::rtrim(src4); + res = simd::VStringFunctions::rtrim(src4); ASSERT_EQ(src, res); // all are blanks, less than 16 blanks StringVal src5(" "); - res = VStringFunctions::rtrim(src5); + res = simd::VStringFunctions::rtrim(src5); ASSERT_EQ(StringVal(""), res); // all are blanks, more than 16 blanks StringVal src6(" "); - res = VStringFunctions::rtrim(src6); + res = simd::VStringFunctions::rtrim(src6); ASSERT_EQ(StringVal(""), res); // src less than 16 length StringVal src7("12345678910 "); - res = VStringFunctions::rtrim(src7); + res = simd::VStringFunctions::rtrim(src7); ASSERT_EQ(StringVal("12345678910"), res); } TEST_F(StringFunctionsTest, is_ascii) { - ASSERT_EQ(true, VStringFunctions::is_ascii(StringVal("hello123"))); - ASSERT_EQ(true, VStringFunctions::is_ascii(StringVal("hello123fwrewerwerwerwrsfqrwerwefwfwrwfsfwe"))); - ASSERT_EQ(false, VStringFunctions::is_ascii(StringVal("运维组123"))); - ASSERT_EQ(false, VStringFunctions::is_ascii(StringVal("hello123运维组fwrewerwerwerwrsfqrwerwefwfwrwfsfwe"))); - ASSERT_EQ(true, VStringFunctions::is_ascii(StringVal::null())); - ASSERT_EQ(true, VStringFunctions::is_ascii(StringVal(""))); + ASSERT_EQ(true, simd::VStringFunctions::is_ascii(StringVal("hello123"))); + ASSERT_EQ(true, simd::VStringFunctions::is_ascii( + StringVal("hello123fwrewerwerwerwrsfqrwerwefwfwrwfsfwe"))); + ASSERT_EQ(false, simd::VStringFunctions::is_ascii(StringVal("运维组123"))); + ASSERT_EQ(false, simd::VStringFunctions::is_ascii( + StringVal("hello123运维组fwrewerwerwerwrsfqrwerwefwfwrwfsfwe"))); + ASSERT_EQ(true, simd::VStringFunctions::is_ascii(StringVal::null())); + ASSERT_EQ(true, simd::VStringFunctions::is_ascii(StringVal(""))); } } // namespace doris