Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions be/cmake/thirdparty.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,7 @@ endif()
if ("${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86" OR "${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86_64")
add_thirdparty(deflate)
endif()

# ICU (International Components for Unicode) libraries, registered as
# third-party dependencies: icuuc, icui18n and icudata.
# NOTE(review): LIB64 is presumably a flag telling add_thirdparty to look in
# the lib64 directory -- confirm against the add_thirdparty definition.
add_thirdparty(icuuc LIB64)
add_thirdparty(icui18n LIB64)
add_thirdparty(icudata LIB64)
62 changes: 52 additions & 10 deletions be/src/vec/functions/function_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@
#include <ctype.h>
#include <math.h>
#include <re2/stringpiece.h>
#include <unicode/locid.h>
#include <unicode/unistr.h>
#include <unicode/ustream.h>

#include <bitset>
#include <cstddef>
#include <cstdint>
#include <string_view>

#include "common/status.h"
Expand Down Expand Up @@ -446,20 +449,59 @@ struct TransferImpl {
return Status::OK();
}

const bool is_ascii = simd::VStringFunctions::is_ascii({data.data(), data.size()});
res_offsets.resize(offset_size);
memcpy_small_allow_read_write_overflow15(
res_offsets.data(), offsets.data(),
offset_size * sizeof(ColumnString::Offsets::value_type));

size_t data_length = data.size();
res_data.resize(data_length);
if constexpr (std::is_same_v<OpName, NameToUpper>) {
simd::VStringFunctions::to_upper(data.data(), data_length, res_data.data());
} else if constexpr (std::is_same_v<OpName, NameToLower>) {
simd::VStringFunctions::to_lower(data.data(), data_length, res_data.data());
if (is_ascii) {
memcpy_small_allow_read_write_overflow15(
res_offsets.data(), offsets.data(),
offset_size * sizeof(ColumnString::Offsets::value_type));

size_t data_length = data.size();
res_data.resize(data_length);
if constexpr (std::is_same_v<OpName, NameToUpper>) {
simd::VStringFunctions::to_upper(data.data(), data_length, res_data.data());
} else if constexpr (std::is_same_v<OpName, NameToLower>) {
simd::VStringFunctions::to_lower(data.data(), data_length, res_data.data());
}
} else {
execute_utf8(data, offsets, res_data, res_offsets);
}

return Status::OK();
}

// Row-by-row case conversion for input that is not pure ASCII.
//
// data / offsets:         source string column (concatenated bytes + per-row end offsets).
// res_data / res_offsets: destination column, filled via StringOP::push_value_string.
//
// Each row is converted with to_upper_utf8 or to_lower_utf8 depending on OpName; the
// converted length may differ from the source length, so the result is appended per row
// instead of being written in place.
static void execute_utf8(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
                         ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
    // Scratch buffer shared across rows; cleared before every conversion.
    std::string converted;
    const auto row_count = static_cast<int64_t>(offsets.size());
    for (int64_t row = 0; row < row_count; ++row) {
        // NOTE(review): for row 0 this reads offsets[-1]; presumably the offsets container
        // tolerates index -1 and yields 0 there -- confirm against ColumnString::Offsets.
        const auto row_start = offsets[row - 1];
        const uint32_t row_len = offsets[row] - row_start;
        const char* row_ptr = reinterpret_cast<const char*>(&data[row_start]);

        converted.clear();
        if constexpr (std::is_same_v<OpName, NameToUpper>) {
            to_upper_utf8(row_ptr, row_len, converted);
        } else if constexpr (std::is_same_v<OpName, NameToLower>) {
            to_lower_utf8(row_ptr, row_len, converted);
        }
        StringOP::push_value_string(converted, row, res_data, res_offsets);
    }
}

// Upper-cases a UTF-8 byte range with ICU and appends the UTF-8 result to `result`.
//
// data / size: input bytes; need not be NUL-terminated.
// result:      output string. ICU's toUTF8String appends, so the caller clears it
//              beforehand when a fresh value is wanted.
//
// The root locale is passed explicitly. The no-argument UnicodeString::toUpper()
// follows the process default locale, which would make the result depend on the host
// environment (for example, a Turkish locale maps 'i' to 'İ' instead of 'I').
// NOTE: per ICU's fromUTF8 contract, ill-formed UTF-8 input is replaced with U+FFFD.
static void to_upper_utf8(const char* data, uint32_t size, std::string& result) {
    icu::StringPiece sp;
    sp.set(data, size);
    icu::UnicodeString unicode_str = icu::UnicodeString::fromUTF8(sp);
    unicode_str.toUpper(icu::Locale::getRoot());
    unicode_str.toUTF8String(result);
}

// Lower-cases a UTF-8 byte range with ICU and appends the UTF-8 result to `result`.
//
// data / size: input bytes; need not be NUL-terminated.
// result:      output string. ICU's toUTF8String appends, so the caller clears it
//              beforehand when a fresh value is wanted.
//
// The root locale is passed explicitly. The no-argument UnicodeString::toLower()
// follows the process default locale, which would make the result depend on the host
// environment (for example, a Turkish locale maps 'I' to dotless 'ı' instead of 'i').
// NOTE: per ICU's fromUTF8 contract, ill-formed UTF-8 input is replaced with U+FFFD.
static void to_lower_utf8(const char* data, uint32_t size, std::string& result) {
    icu::StringPiece sp;
    sp.set(data, size);
    icu::UnicodeString unicode_str = icu::UnicodeString::fromUTF8(sp);
    unicode_str.toLower(icu::Locale::getRoot());
    unicode_str.toUTF8String(result);
}
};

// Capitalize first letter
Expand Down
18 changes: 13 additions & 5 deletions be/test/vec/function/function_string_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,13 @@ TEST(function_string_test, function_string_lower_test) {
{{std::string("123ABC_")}, std::string("123abc_")},
{{std::string("MYtestSTR")}, std::string("myteststr")},
{{std::string("")}, std::string("")},
{{std::string("ÀÇ")}, std::string("àç")},
{{std::string("ÀÇAC123")}, std::string("àçac123")},
{{std::string("İstanbul")}, std::string("i̇stanbul")},
{{std::string("KIZILAY")}, std::string("kizilay")},
{{std::string("GROSSE")}, std::string("grosse")},
{{std::string("Å")}, std::string("å")},
{{std::string("ΣΟΦΟΣ")}, std::string("σοφος")},
{{Null()}, Null()},
//bug{{std::string("ΔΟΚΙΜΑΣΤΙΚΌ ΚΕΊΜΕΝΟ")}, std::string("δοκιμαστικό κείμενο")},
};
Expand Down Expand Up @@ -536,12 +543,13 @@ TEST(function_string_test, function_string_upper_test) {
{{std::string("โพสต์ทดสอบ")}, std::string("โพสต์ทดสอบ")},
{{std::string("יידיש טעקסט")}, std::string("יידיש טעקסט")},
//bug{{std::string("Exámplè wïth âccents")}, std::string("EXÁMPLÈ WÏTH ÂCCENTS")},
{{std::string("ⓔⓧⓐⓜⓟⓛⓔ ⓦⓘⓣⓗ ⓒⓘⓡⓒⓛⓔ ⓛⓔⓣⓣⓔⓡⓢ")},
std::string("ⓔⓧⓐⓜⓟⓛⓔ ⓦⓘⓣⓗ ⓒⓘⓡⓒⓛⓔ ⓛⓔⓣⓣⓔⓡⓢ")},
{{std::string("🅴🆇🅰🅼🅿🅻🅴 🆆🅸🆃🅷 🆂🆀🆄🅰🆁🅴 🅻🅴🆃🆃🅴🆁🆂")},
std::string("🅴🆇🅰🅼🅿🅻🅴 🆆🅸🆃🅷 🆂🆀🆄🅰🆁🅴 🅻🅴🆃🆃🅴🆁🆂")},
{{std::string("àç")}, std::string("ÀÇ")},
{{std::string("straße")}, std::string("STRASSE")},
{{std::string("àçac123")}, std::string("ÀÇAC123")},
{{std::string("ffi")}, std::string("FFI")},
{{std::string("Dž")}, std::string("DŽ")},
{{std::string("Ångström")}, std::string("ÅNGSTRÖM")},
};

check_function_all_arg_comb<DataTypeString, true>(func_name, input_types, data_set);
check_function_all_arg_comb<DataTypeString, true>(std::string("ucase"), input_types,
data_set);
Expand Down
1 change: 1 addition & 0 deletions dist/LICENSE-dist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1536,3 +1536,4 @@ Other dependencies:
* xxhash: 0.8.1 -- licenses/LICENSE-xxhash.txt
* concurrentqueue: 1.0.3 -- licenses/LICENSE-concurrentqueue.txt
* FlameGraph -- licenses/LICENSE-CDDL-1.0.txt
* icu: 75.1 -- licenses/LICENSE-icu.txt
Loading
Loading