From aec5e68af725942571b9c38dde2594b513f36a5a Mon Sep 17 00:00:00 2001 From: happenlee Date: Wed, 26 Oct 2022 00:51:55 +0800 Subject: [PATCH 1/2] [opt](exec) Replace get_utf8_byte_length function by array --- be/src/exprs/string_functions.cpp | 10 ++++----- be/src/util/simd/vstring_function.h | 29 ++++++++++---------------- be/src/vec/functions/function_string.h | 22 ++++--------------- 3 files changed, 20 insertions(+), 41 deletions(-) diff --git a/be/src/exprs/string_functions.cpp b/be/src/exprs/string_functions.cpp index bb033d0c041772..399db12bba0a86 100644 --- a/be/src/exprs/string_functions.cpp +++ b/be/src/exprs/string_functions.cpp @@ -37,7 +37,7 @@ void StringFunctions::init() {} size_t get_char_len(const StringVal& str, std::vector* str_index) { size_t char_len = 0; for (size_t i = 0, char_size = 0; i < str.len; i += char_size) { - char_size = get_utf8_byte_length((unsigned)(str.ptr)[i]); + char_size = UTF8_BYTE_LENGTH[(unsigned)(str.ptr)[i]]; str_index->push_back(i); ++char_len; } @@ -65,7 +65,7 @@ StringVal StringFunctions::substring(FunctionContext* context, const StringVal& size_t byte_pos = 0; std::vector index; for (size_t i = 0, char_size = 0; i < str.len; i += char_size) { - char_size = get_utf8_byte_length((unsigned)(str.ptr)[i]); + char_size = UTF8_BYTE_LENGTH[(unsigned)(str.ptr)[i]]; index.push_back(i); if (pos.val > 0 && index.size() > pos.val + len.val) { break; @@ -328,7 +328,7 @@ IntVal StringFunctions::char_utf8_length(FunctionContext* context, const StringV } size_t char_len = 0; for (size_t i = 0, char_size = 0; i < str.len; i += char_size) { - char_size = get_utf8_byte_length((unsigned)(str.ptr)[i]); + char_size = UTF8_BYTE_LENGTH[(unsigned)(str.ptr)[i]]; ++char_len; } return IntVal(char_len); @@ -429,7 +429,7 @@ IntVal StringFunctions::instr(FunctionContext* context, const StringVal& str, if (loc > 0) { size_t char_len = 0; for (size_t i = 0, char_size = 0; i < loc; i += char_size) { - char_size = get_utf8_byte_length((unsigned)(str.ptr)[i]); + char_size = UTF8_BYTE_LENGTH[(unsigned)(str.ptr)[i]]; ++char_len; } loc = char_len; @@ -477,7 +477,7 @@ IntVal StringFunctions::locate_pos(FunctionContext* context, const StringVal& su // Hive returns the position in the original string starting from 1. size_t char_len = 0; for (size_t i = 0, char_size = 0; i < match_pos; i += char_size) { - char_size = get_utf8_byte_length((unsigned)(adjusted_str.ptr)[i]); + char_size = UTF8_BYTE_LENGTH[(unsigned)(adjusted_str.ptr)[i]]; ++char_len; } match_pos = char_len; diff --git a/be/src/util/simd/vstring_function.h b/be/src/util/simd/vstring_function.h index e627683d1a4821..f19cc33d6e8f88 100644 --- a/be/src/util/simd/vstring_function.h +++ b/be/src/util/simd/vstring_function.h @@ -30,23 +30,16 @@ namespace doris { -static size_t get_utf8_byte_length(unsigned char byte) { - size_t char_size = 0; - if (byte >= 0xFC) { - char_size = 6; - } else if (byte >= 0xF8) { - char_size = 5; - } else if (byte >= 0xF0) { - char_size = 4; - } else if (byte >= 0xE0) { - char_size = 3; - } else if (byte >= 0xC0) { - char_size = 2; - } else { - char_size = 1; - } - return char_size; -} +static constexpr std::array UTF8_BYTE_LENGTH = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6}; namespace simd { @@ -149,7 +142,7 @@ class VStringFunctions { } } else { for (size_t i = 0, char_size = 0; i < str.len; i += char_size) { - char_size = get_utf8_byte_length((unsigned)(str.ptr)[i]); + char_size = UTF8_BYTE_LENGTH[(unsigned)(str.ptr)[i]]; std::copy(str.ptr + i, str.ptr + i + char_size, dst.ptr + str.len - i - char_size); } } diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index cc51514b8b80ba..1663fde90aed3e 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -58,24 +58,10 @@ namespace doris::vectorized { -inline size_t get_utf8_byte_length(unsigned char byte) { - size_t char_size = 0; - if (byte < 0xC0) { - char_size = 1; - } else if (byte >= 0xF0) { - char_size = 4; - } else if (byte >= 0xE0) { - char_size = 3; - } else { - char_size = 2; - } - return char_size; -} - inline size_t get_char_len(const std::string_view& str, std::vector* str_index) { size_t char_len = 0; for (size_t i = 0, char_size = 0; i < str.length(); i += char_size) { - char_size = get_utf8_byte_length(str[i]); + char_size = UTF8_BYTE_LENGTH[(unsigned)str[i]]; str_index->push_back(i); ++char_len; } @@ -85,7 +71,7 @@ inline size_t get_char_len(const std::string_view& str, std::vector* str inline size_t get_char_len(const StringVal& str, std::vector* str_index) { size_t char_len = 0; for (size_t i = 0, char_size = 0; i < str.len; i += char_size) { - char_size = get_utf8_byte_length((unsigned)(str.ptr)[i]); + char_size = UTF8_BYTE_LENGTH[(unsigned)(str.ptr)[i]]; str_index->push_back(i); ++char_len; } @@ -95,7 +81,7 @@ inline size_t get_char_len(const StringVal& str, std::vector* str_index) inline size_t get_char_len(const StringValue& str, size_t end_pos) { size_t char_len = 0; for (size_t i = 0, char_size = 0; i < std::min(str.len, end_pos); i += char_size) { - char_size = get_utf8_byte_length((unsigned)(str.ptr)[i]); + char_size = UTF8_BYTE_LENGTH[(unsigned)(str.ptr)[i]]; ++char_len; } return char_len; @@ -192,7 +178,7 @@ struct SubstringUtil { size_t byte_pos = 0; index.clear(); for (size_t j = 0, char_size = 0; j < str_size; j += char_size) { - char_size = get_utf8_byte_length((unsigned)(raw_str)[j]); + char_size = UTF8_BYTE_LENGTH[(unsigned)(raw_str)[j]]; index.push_back(j); if (start[i] > 0 && index.size() > start[i] + len[i]) { break; From df1d722069d240a9904617d593b4b50a8cbacdae Mon Sep 17 00:00:00 2001 From: happenlee Date: Wed, 26 Oct 2022 11:44:10 +0800 Subject: [PATCH 2/2] change unsigned to unsigned char --- be/src/exprs/string_functions.cpp | 10 +++++----- be/src/util/simd/vstring_function.h | 2 +- be/src/vec/functions/function_string.h | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/be/src/exprs/string_functions.cpp b/be/src/exprs/string_functions.cpp index 399db12bba0a86..96a8be323e9c52 100644 --- a/be/src/exprs/string_functions.cpp +++ b/be/src/exprs/string_functions.cpp @@ -37,7 +37,7 @@ void StringFunctions::init() {} size_t get_char_len(const StringVal& str, std::vector* str_index) { size_t char_len = 0; for (size_t i = 0, char_size = 0; i < str.len; i += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned)(str.ptr)[i]]; + char_size = UTF8_BYTE_LENGTH[(unsigned char)(str.ptr)[i]]; str_index->push_back(i); ++char_len; } @@ -65,7 +65,7 @@ StringVal StringFunctions::substring(FunctionContext* context, const StringVal& size_t byte_pos = 0; std::vector index; for (size_t i = 0, char_size = 0; i < str.len; i += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned)(str.ptr)[i]]; + char_size = UTF8_BYTE_LENGTH[(unsigned char)(str.ptr)[i]]; index.push_back(i); if (pos.val > 0 && index.size() > pos.val + len.val) { break; @@ -328,7 +328,7 @@ IntVal StringFunctions::char_utf8_length(FunctionContext* context, const StringV } size_t char_len = 0; for (size_t i = 0, char_size = 0; i < str.len; i += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned)(str.ptr)[i]]; + char_size = UTF8_BYTE_LENGTH[(unsigned char)(str.ptr)[i]]; ++char_len; } return IntVal(char_len); @@ -429,7 +429,7 @@ IntVal StringFunctions::instr(FunctionContext* context, const StringVal& str, if (loc > 0) { size_t char_len = 0; for (size_t i = 0, char_size = 0; i < loc; i += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned)(str.ptr)[i]]; + char_size = UTF8_BYTE_LENGTH[(unsigned char)(str.ptr)[i]]; ++char_len; } loc = char_len; @@ -477,7 +477,7 @@ IntVal StringFunctions::locate_pos(FunctionContext* context, const StringVal& su // Hive returns the position in the original string starting from 1. size_t char_len = 0; for (size_t i = 0, char_size = 0; i < match_pos; i += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned)(adjusted_str.ptr)[i]]; + char_size = UTF8_BYTE_LENGTH[(unsigned char)(adjusted_str.ptr)[i]]; ++char_len; } match_pos = char_len; diff --git a/be/src/util/simd/vstring_function.h b/be/src/util/simd/vstring_function.h index f19cc33d6e8f88..0e8c3d075e017c 100644 --- a/be/src/util/simd/vstring_function.h +++ b/be/src/util/simd/vstring_function.h @@ -142,7 +142,7 @@ class VStringFunctions { } } else { for (size_t i = 0, char_size = 0; i < str.len; i += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned)(str.ptr)[i]]; + char_size = UTF8_BYTE_LENGTH[(unsigned char)(str.ptr)[i]]; std::copy(str.ptr + i, str.ptr + i + char_size, dst.ptr + str.len - i - char_size); } } diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 1663fde90aed3e..d3b278a0af1af1 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -61,7 +61,7 @@ namespace doris::vectorized { inline size_t get_char_len(const std::string_view& str, std::vector* str_index) { size_t char_len = 0; for (size_t i = 0, char_size = 0; i < str.length(); i += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned)str[i]]; + char_size = UTF8_BYTE_LENGTH[(unsigned char)str[i]]; str_index->push_back(i); ++char_len; } @@ -71,7 +71,7 @@ inline size_t get_char_len(const std::string_view& str, std::vector* str inline size_t get_char_len(const StringVal& str, std::vector* str_index) { size_t char_len = 0; for (size_t i = 0, char_size = 0; i < str.len; i += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned)(str.ptr)[i]]; + char_size = UTF8_BYTE_LENGTH[(unsigned char)(str.ptr)[i]]; str_index->push_back(i); ++char_len; } @@ -81,7 +81,7 @@ inline size_t get_char_len(const StringVal& str, std::vector* str_index) inline size_t get_char_len(const StringValue& str, size_t end_pos) { size_t char_len = 0; for (size_t i = 0, char_size = 0; i < std::min(str.len, end_pos); i += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned)(str.ptr)[i]]; + char_size = UTF8_BYTE_LENGTH[(unsigned char)(str.ptr)[i]]; ++char_len; } return char_len; @@ -178,7 +178,7 @@ struct SubstringUtil { size_t byte_pos = 0; index.clear(); for (size_t j = 0, char_size = 0; j < str_size; j += char_size) { - char_size = UTF8_BYTE_LENGTH[(unsigned)(raw_str)[j]]; + char_size = UTF8_BYTE_LENGTH[(unsigned char)(raw_str)[j]]; index.push_back(j); if (start[i] > 0 && index.size() > start[i] + len[i]) { break;