From afe1bb34d08c90a31584e79c2dc40a2164c6a4e2 Mon Sep 17 00:00:00 2001 From: baishen Date: Tue, 23 Aug 2022 23:28:55 +0800 Subject: [PATCH 01/11] feat: support function unhex pushdown --- dbms/src/Flash/Coprocessor/DAGUtils.cpp | 2 +- dbms/src/Functions/FunctionsString.cpp | 164 ++++++++++++++++++ .../Functions/tests/gtest_strings_unhex.cpp | 58 +++++++ tests/fullstack-test/expr/unhex.test | 35 ++++ 4 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 dbms/src/Functions/tests/gtest_strings_unhex.cpp create mode 100644 tests/fullstack-test/expr/unhex.test diff --git a/dbms/src/Flash/Coprocessor/DAGUtils.cpp b/dbms/src/Flash/Coprocessor/DAGUtils.cpp index 75eb75ecbb0..06cfb4ee271 100755 --- a/dbms/src/Flash/Coprocessor/DAGUtils.cpp +++ b/dbms/src/Flash/Coprocessor/DAGUtils.cpp @@ -670,7 +670,7 @@ const std::unordered_map scalar_func_map({ {tipb::ScalarFuncSig::Trim3Args, "tidbTrim"}, {tipb::ScalarFuncSig::LTrim, "tidbLTrim"}, {tipb::ScalarFuncSig::RTrim, "tidbRTrim"}, - //{tipb::ScalarFuncSig::UnHex, "cast"}, + {tipb::ScalarFuncSig::UnHex, "tidbUnHex"}, {tipb::ScalarFuncSig::UpperUTF8, "upperUTF8"}, {tipb::ScalarFuncSig::Upper, "upperBinary"}, //{tipb::ScalarFuncSig::CharLength, "upper"}, diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index 8d70b368149..0c62270120e 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -5615,6 +5615,169 @@ class FunctionElt : public IFunction } }; +class FunctionTiDBUnHex : public IFunction +{ +public: + static constexpr auto name = "tidbUnHex"; + FunctionTiDBUnHex() = default; + + static FunctionPtr create(const Context & /*context*/) + { + return std::make_shared(); + } + + std::string getName() const override { return name; } + size_t getNumberOfArguments() const override { return 1; } + bool useDefaultImplementationForConstants() const override { return true; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!arguments[0]->isStringOrFixedString()) + throw Exception( + fmt::format("Illegal type {} of first argument of function {}", arguments[0]->getName(), getName()), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + return makeNullable(std::make_shared()); + } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result) const override + { + const ColumnPtr & column = block.getByPosition(arguments[0]).column; + + size_t size = block.rows(); + auto col_res = ColumnString::create(); + auto result_null_map = ColumnUInt8::create(size, 0); + if (const auto * col = checkAndGetColumn(column.get())) + { + executeString(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), result_null_map->getData()); + block.getByPosition(result).column = ColumnNullable::create(std::move(col_res), std::move(result_null_map)); + } + else if (const auto * col = checkAndGetColumn(column.get())) + { + executeFixedString(col->getChars(), col->getN(), col_res->getChars(), col_res->getOffsets(), result_null_map->getData()); + block.getByPosition(result).column = ColumnNullable::create(std::move(col_res), std::move(result_null_map)); + } + else + { + throw Exception(fmt::format("Illegal argument of function {}", getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + } + +private: + static void executeString(const ColumnString::Chars_t & data, + const ColumnString::Offsets & offsets, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets, + ColumnUInt8::Container & res_null_map) + { + size_t size = offsets.size(); + res_data.resize(data.size() / 2 + size); + res_offsets.resize(size); + + ColumnString::Offset pos = 0; + ColumnString::Offset prev_offset = 0; + for (size_t i = 0; i < size; ++i) + { + size_t begin = prev_offset; + size_t length = offsets[i] - prev_offset; + unhexOne(data, length, i, begin, pos, res_data, res_offsets, res_null_map); + pos = res_offsets[i]; + prev_offset = offsets[i]; + } + res_data.resize(pos); + } + + static void executeFixedString(const ColumnString::Chars_t & data, + const size_t size, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets, + ColumnUInt8::Container & res_null_map) + { + size_t length = data.size() / size; + if (length % 2 != 0) + { + res_data.resize(data.size() / 2 + size); + } + else + { + res_data.resize(data.size() / 2); + } + res_offsets.resize(size); + + ColumnString::Offset pos = 0; + for (size_t i = 0; i < size; ++i) + { + size_t begin = i * length; + unhexOne(data, length, i, begin, pos, res_data, res_offsets, res_null_map); + pos = res_offsets[i]; + } + res_data.resize(pos); + } + + static void unhexOne(const ColumnString::Chars_t & data, + const size_t length, + const size_t idx, + size_t begin, + size_t pos, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets, + ColumnUInt8::Container & res_null_map) + { + char low; + char high; + res_offsets[idx] = pos; + if (length % 2 != 0) + { + const char * byte = reinterpret_cast(&data[begin]); + bool ok = fromHexChar(byte, low); + if (!ok) + { + res_null_map[idx] = 1; + return; + } + res_data[pos] = low; + pos++; + begin++; + } + for (size_t i = begin; i < begin + length; i += 2) + { + const char * byte1 = reinterpret_cast(&data[i]); + const char * byte2 = reinterpret_cast(&data[i + 1]); + bool ok1 = fromHexChar(byte1, high); + bool ok2 = fromHexChar(byte2, low); + if (!ok1 || !ok2) + { + res_null_map[idx] = 1; + return; + } + int val = (high << 4) | low; + res_data[pos] = val; + pos++; + } + res_offsets[idx] = pos; + } + + static bool fromHexChar(const char *in, char &out) { + if (*in >= '0' && *in <= '9') + { + out = *in - '0'; + } + else if (*in >= 'a' && *in <= 'f') + { + out = *in - 'a' + 10; + } + else if (*in >= 'A' && *in <= 'F') + { + out = *in - 'A' + 10; + } + else + { + return false; + } + return true; + } + +}; + // clang-format off struct NameEmpty { static constexpr auto name = "empty"; }; struct NameNotEmpty { static constexpr auto name = "notEmpty"; }; @@ -5704,5 +5867,6 @@ void registerFunctionsString(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); } } // namespace DB diff --git a/dbms/src/Functions/tests/gtest_strings_unhex.cpp b/dbms/src/Functions/tests/gtest_strings_unhex.cpp new file mode 100644 index 00000000000..32beabce7d0 --- /dev/null +++ b/dbms/src/Functions/tests/gtest_strings_unhex.cpp @@ -0,0 +1,58 @@ +// Copyright 2022 PingCAP, Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include +#include + + +namespace DB +{ +namespace tests +{ +class UnHexTest : public DB::tests::FunctionTest +{ +}; + +TEST_F(UnHexTest, unhex_all_unit_Test) +try +{ + const String & func_name = "tidbUnHex"; + + ASSERT_COLUMN_EQ( + createColumn>({"www.pingcap.com", "abcd", std::nullopt, std::nullopt, ""}), + executeFunction( + func_name, + createColumn>({"7777772E70696E676361702E636F6D", "61626364", std::nullopt, "GG", ""}))); + + // CJK and emoji + ASSERT_COLUMN_EQ( + createColumn>({"さらに入", "测试测试测试测试abcd测试", "🍻", "🏴‍☠️"}), + executeFunction( + func_name, + createColumn>({"E38195E38289E381ABE585A5", "E6B58BE8AF95E6B58BE8AF95E6B58BE8AF95E6B58BE8AF9561626364E6B58BE8AF95", "F09F8DBB", "F09F8FB4E2808DE298A0EFB88F"}))); + + // Special Empty Character + ASSERT_COLUMN_EQ( + createColumn>({"\t", "\t", "\n" "\n", " "}), + executeFunction( + func_name, + createColumn>({"9", "09", "A", "0A", "20"}))); +} +CATCH +} // namespace tests +} // namespace DB \ No newline at end of file diff --git a/tests/fullstack-test/expr/unhex.test b/tests/fullstack-test/expr/unhex.test new file mode 100644 index 00000000000..37e7dab9b2a --- /dev/null +++ b/tests/fullstack-test/expr/unhex.test @@ -0,0 +1,35 @@ +# Copyright 2022 PingCAP, Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +mysql> drop table if exists test.t; +mysql> create table if not exists test.t(a char(100), b int); + +mysql> insert into test.t values('7777772E70696E676361702E636F6D', 3039); +mysql> insert into test.t values('61626364', 61626364); +mysql> insert into test.t values('GG', -1); +mysql> insert into test.t values('E38195E38289E381ABE585A5', 313233); +mysql> insert into test.t values('F09F8DBB', 414243); +mysql> alter table test.t set tiflash replica 1; +func> wait_table test t + +mysql> set tidb_enforce_mpp=1; set tidb_isolation_read_engines='tiflash'; select unhex(a), unhex(b) from test.t; ++-----------------+----------+ +| unhex(a) | unhex(b) | ++-----------------+----------+ +| www.pingcap.com | 09 | +| abcd | abcd | +| NULL | NULL | +| さらに入 | 123 | +| 🍻 | ABC | ++-----------------+----------+ From 343cd8f219a7872e83f6375fb53722c783f47f5a Mon Sep 17 00:00:00 2001 From: baishen Date: Wed, 24 Aug 2022 15:50:34 +0800 Subject: [PATCH 02/11] format --- dbms/src/Functions/FunctionsString.cpp | 34 +++++++++---------- .../Functions/tests/gtest_strings_unhex.cpp | 4 ++- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index 0c62270120e..3f25aca3cd6 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -5664,10 +5664,10 @@ class FunctionTiDBUnHex : public IFunction private: static void executeString(const ColumnString::Chars_t & data, - const ColumnString::Offsets & offsets, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets, - ColumnUInt8::Container & res_null_map) + const ColumnString::Offsets & offsets, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets, + ColumnUInt8::Container & res_null_map) { size_t size = offsets.size(); res_data.resize(data.size() / 2 + size); @@ -5687,10 +5687,10 @@ class FunctionTiDBUnHex : public IFunction } static void executeFixedString(const ColumnString::Chars_t & data, - const size_t size, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets, - ColumnUInt8::Container & res_null_map) + const size_t size, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets, + ColumnUInt8::Container & res_null_map) { size_t length = data.size() / size; if (length % 2 != 0) @@ -5714,13 +5714,13 @@ class FunctionTiDBUnHex : public IFunction } static void unhexOne(const ColumnString::Chars_t & data, - const size_t length, - const size_t idx, - size_t begin, - size_t pos, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets, - ColumnUInt8::Container & res_null_map) + const size_t length, + const size_t idx, + size_t begin, + size_t pos, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets, + ColumnUInt8::Container & res_null_map) { char low; char high; @@ -5756,7 +5756,8 @@ class FunctionTiDBUnHex : public IFunction res_offsets[idx] = pos; } - static bool fromHexChar(const char *in, char &out) { + static bool fromHexChar(const char * in, char & out) + { if (*in >= '0' && *in <= '9') { out = *in - '0'; @@ -5775,7 +5776,6 @@ class FunctionTiDBUnHex : public IFunction } return true; } - }; // clang-format off diff --git a/dbms/src/Functions/tests/gtest_strings_unhex.cpp b/dbms/src/Functions/tests/gtest_strings_unhex.cpp index 32beabce7d0..c97f5109b96 100644 --- a/dbms/src/Functions/tests/gtest_strings_unhex.cpp +++ b/dbms/src/Functions/tests/gtest_strings_unhex.cpp @@ -48,7 +48,9 @@ try // Special Empty Character ASSERT_COLUMN_EQ( - createColumn>({"\t", "\t", "\n" "\n", " "}), + createColumn>({"\t", "\t", "\n" + "\n", + " "}), executeFunction( func_name, createColumn>({"9", "09", "A", "0A", "20"}))); From 760eea4abe3722a6989b57c3bb134857a1f5de47 Mon Sep 17 00:00:00 2001 From: baishen Date: Wed, 16 Nov 2022 23:27:38 +0800 Subject: [PATCH 03/11] fix test --- dbms/src/Functions/FunctionsString.cpp | 10 ++++++---- dbms/src/Functions/tests/gtest_strings_unhex.cpp | 4 +--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index 206ca5bf945..afdc9582ac6 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -6019,7 +6019,7 @@ class FunctionTiDBUnHex : public IFunction for (size_t i = 0; i < size; ++i) { size_t begin = prev_offset; - size_t length = offsets[i] - prev_offset; + size_t length = offsets[i] - prev_offset - 1; unhexOne(data, length, i, begin, pos, res_data, res_offsets, res_null_map); pos = res_offsets[i]; prev_offset = offsets[i]; @@ -6065,7 +6065,9 @@ class FunctionTiDBUnHex : public IFunction { char low; char high; - res_offsets[idx] = pos; + size_t end = begin + length; + res_offsets[idx] = pos + 1; + if (length % 2 != 0) { const char * byte = reinterpret_cast(&data[begin]); @@ -6079,7 +6081,7 @@ class FunctionTiDBUnHex : public IFunction pos++; begin++; } - for (size_t i = begin; i < begin + length; i += 2) + for (size_t i = begin; i < end; i += 2) { const char * byte1 = reinterpret_cast(&data[i]); const char * byte2 = reinterpret_cast(&data[i + 1]); @@ -6094,7 +6096,7 @@ class FunctionTiDBUnHex : public IFunction res_data[pos] = val; pos++; } - res_offsets[idx] = pos; + res_offsets[idx] = pos + 1; } static bool fromHexChar(const char * in, char & out) diff --git a/dbms/src/Functions/tests/gtest_strings_unhex.cpp b/dbms/src/Functions/tests/gtest_strings_unhex.cpp index c97f5109b96..23dd8d27f0e 100644 --- a/dbms/src/Functions/tests/gtest_strings_unhex.cpp +++ b/dbms/src/Functions/tests/gtest_strings_unhex.cpp @@ -48,9 +48,7 @@ try // Special Empty Character ASSERT_COLUMN_EQ( - createColumn>({"\t", "\t", "\n" - "\n", - " "}), + createColumn>({"\t", "\t", "\n", "\n", " "}), executeFunction( func_name, createColumn>({"9", "09", "A", "0A", "20"}))); From 51e507e71c845d8ae44424c236e45faa5e9910c3 Mon Sep 17 00:00:00 2001 From: baishen Date: Sat, 19 Nov 2022 22:51:32 +0800 Subject: [PATCH 04/11] fix test --- dbms/src/Functions/FunctionsString.cpp | 143 ++++++++++++++---- .../Functions/tests/gtest_strings_unhex.cpp | 49 ++++++ 2 files changed, 163 insertions(+), 29 deletions(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index afdc9582ac6..a95f27e11e3 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -5973,7 +5973,7 @@ class FunctionTiDBUnHex : public IFunction DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (!arguments[0]->isStringOrFixedString()) + if (!arguments[0]->isStringOrFixedString() && !arguments[0]->isNumber()) throw Exception( fmt::format("Illegal type {} of first argument of function {}", arguments[0]->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); @@ -5987,14 +5987,18 @@ class FunctionTiDBUnHex : public IFunction size_t size = block.rows(); auto col_res = ColumnString::create(); auto result_null_map = ColumnUInt8::create(size, 0); - if (const auto * col = checkAndGetColumn(column.get())) - { - executeString(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), result_null_map->getData()); - block.getByPosition(result).column = ColumnNullable::create(std::move(col_res), std::move(result_null_map)); - } - else if (const auto * col = checkAndGetColumn(column.get())) + + if (executeUnHexString(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) + || executeUnHexFixedString(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) + || executeUnHexInt(column, true, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) + || executeUnHexInt(column, true, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) + || executeUnHexInt(column, true, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) + || executeUnHexInt(column, true, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) + || executeUnHexInt(column, false, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) + || executeUnHexInt(column, false, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) + || executeUnHexInt(column, false, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) + || executeUnHexInt(column, false, col_res->getChars(), col_res->getOffsets(), result_null_map->getData())) { - executeFixedString(col->getChars(), col->getN(), col_res->getChars(), col_res->getOffsets(), result_null_map->getData()); block.getByPosition(result).column = ColumnNullable::create(std::move(col_res), std::move(result_null_map)); } else @@ -6004,13 +6008,19 @@ class FunctionTiDBUnHex : public IFunction } private: - static void executeString(const ColumnString::Chars_t & data, - const ColumnString::Offsets & offsets, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets, - ColumnUInt8::Container & res_null_map) + static bool executeUnHexString(const ColumnPtr & column, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets, + ColumnUInt8::Container & res_null_map) { - size_t size = offsets.size(); + const auto col = checkAndGetColumn(column.get()); + if (col == nullptr) + { + return false; + } + const size_t size = col->size(); + const ColumnString::Chars_t & data = col->getChars(); + const ColumnString::Offsets & offsets = col->getOffsets(); res_data.resize(data.size() / 2 + size); res_offsets.resize(size); @@ -6025,22 +6035,31 @@ class FunctionTiDBUnHex : public IFunction prev_offset = offsets[i]; } res_data.resize(pos); + + return true; } - static void executeFixedString(const ColumnString::Chars_t & data, - const size_t size, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets, - ColumnUInt8::Container & res_null_map) + static bool executeUnHexFixedString(const ColumnPtr & column, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets, + ColumnUInt8::Container & res_null_map) { - size_t length = data.size() / size; + const auto col = checkAndGetColumn(column.get()); + if (col == nullptr) + { + return false; + } + const size_t size = col->size(); + const ColumnString::Chars_t & data = col->getChars(); + const size_t length = col->getN(); + if (length % 2 != 0) { - res_data.resize(data.size() / 2 + size); + res_data.resize(length / 2 + 1); } else { - res_data.resize(data.size() / 2); + res_data.resize(length / 2); } res_offsets.resize(size); @@ -6052,6 +6071,76 @@ class FunctionTiDBUnHex : public IFunction pos = res_offsets[i]; } res_data.resize(pos); + + return true; + } + + template + static bool executeUnHexInt( + const ColumnPtr & column, + const bool is_unsigned, + ColumnString::Chars_t & res_chars, + ColumnString::Offsets & res_offsets, + ColumnUInt8::Container & res_null_map) + { + const auto col = checkAndGetColumn>(column.get()); + if (col == nullptr) + { + return false; + } + const size_t size = col->size(); + res_chars.resize(size * 10); + res_offsets.resize(size); + + char low; + char high; + char data[20]; + size_t length; + size_t pos = 0; + for (size_t i = 0; i < size; ++i) + { + if (is_unsigned) + { + UInt64 number = col->getUInt(i); + length = sprintf(data, "%llu", number); + } + else + { + Int64 number = col->getInt(i); + if (number < 0) + { + pos++; + res_offsets[i] = pos; + res_null_map[i] = 1; + continue; + } + length = sprintf(data, "%lli", number); + } + + size_t begin = 0; + if (length % 2 != 0) + { + const char * byte = reinterpret_cast(&data[begin]); + fromHexChar(byte, low); + res_chars[pos] = low; + pos++; + begin++; + } + for (size_t j = begin; j < length; j += 2) + { + const char * byte1 = reinterpret_cast(&data[j]); + const char * byte2 = reinterpret_cast(&data[j + 1]); + fromHexChar(byte1, high); + fromHexChar(byte2, low); + res_chars[pos] = (high << 4) | low; + pos++; + } + pos++; + res_offsets[i] = pos; + } + res_chars.resize(pos); + + return true; } static void unhexOne(const ColumnString::Chars_t & data, @@ -6071,8 +6160,7 @@ class FunctionTiDBUnHex : public IFunction if (length % 2 != 0) { const char * byte = reinterpret_cast(&data[begin]); - bool ok = fromHexChar(byte, low); - if (!ok) + if (!fromHexChar(byte, low)) { res_null_map[idx] = 1; return; @@ -6085,15 +6173,12 @@ class FunctionTiDBUnHex : public IFunction { const char * byte1 = reinterpret_cast(&data[i]); const char * byte2 = reinterpret_cast(&data[i + 1]); - bool ok1 = fromHexChar(byte1, high); - bool ok2 = fromHexChar(byte2, low); - if (!ok1 || !ok2) + if (!fromHexChar(byte1, high) || !fromHexChar(byte2, low)) { res_null_map[idx] = 1; return; } - int val = (high << 4) | low; - res_data[pos] = val; + res_data[pos] = (high << 4) | low; pos++; } res_offsets[idx] = pos + 1; diff --git a/dbms/src/Functions/tests/gtest_strings_unhex.cpp b/dbms/src/Functions/tests/gtest_strings_unhex.cpp index 23dd8d27f0e..439c5f0bab8 100644 --- a/dbms/src/Functions/tests/gtest_strings_unhex.cpp +++ b/dbms/src/Functions/tests/gtest_strings_unhex.cpp @@ -52,6 +52,55 @@ try executeFunction( func_name, createColumn>({"9", "09", "A", "0A", "20"}))); + + ASSERT_COLUMN_EQ( + createColumn>({"abcd", "\tg", std::nullopt, std::nullopt}), + executeFunction( + func_name, + createColumn>({61626364, 967, std::nullopt, -1}))); + + ASSERT_COLUMN_EQ( + createColumn>({"abcd", "\tg", std::nullopt}), + executeFunction( + func_name, + createColumn>({61626364, 967, std::nullopt}))); + + ASSERT_COLUMN_EQ( + createColumn>({"abc", "\tg", std::nullopt, std::nullopt}), + executeFunction( + func_name, + createColumn>({616263, 967, std::nullopt, -1}))); + + ASSERT_COLUMN_EQ( + createColumn>({"abc", "\tg", std::nullopt}), + executeFunction( + func_name, + createColumn>({616263, 967, std::nullopt}))); + + ASSERT_COLUMN_EQ( + createColumn>({"ab", "\tg", std::nullopt, std::nullopt}), + executeFunction( + func_name, + createColumn>({6162, 967, std::nullopt, -1}))); + + ASSERT_COLUMN_EQ( + createColumn>({"ab", "\tg", std::nullopt}), + executeFunction( + func_name, + createColumn>({6162, 967, std::nullopt}))); + + ASSERT_COLUMN_EQ( + createColumn>({"a", "\t", std::nullopt, std::nullopt}), + executeFunction( + func_name, + createColumn>({61, 9, std::nullopt, -1}))); + + ASSERT_COLUMN_EQ( + createColumn>({"a", "\t", std::nullopt}), + executeFunction( + func_name, + createColumn>({61, 9, std::nullopt}))); + } CATCH } // namespace tests From 692013f1fbce85b5a0074600147e86f4ae7f5807 Mon Sep 17 00:00:00 2001 From: baishen Date: Sat, 19 Nov 2022 23:23:42 +0800 Subject: [PATCH 05/11] fix --- dbms/src/Functions/FunctionsString.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index a95f27e11e3..bdd291bea13 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -6102,7 +6102,7 @@ class FunctionTiDBUnHex : public IFunction if (is_unsigned) { UInt64 number = col->getUInt(i); - length = sprintf(data, "%llu", number); + length = sprintf(data, "%lu", number); } else { @@ -6114,7 +6114,7 @@ class FunctionTiDBUnHex : public IFunction res_null_map[i] = 1; continue; } - length = sprintf(data, "%lli", number); + length = sprintf(data, "%li", number); } size_t begin = 0; From 66a2b66fcb3497ae072c87919e3d901c954c0826 Mon Sep 17 00:00:00 2001 From: baishen Date: Sat, 19 Nov 2022 23:35:19 +0800 Subject: [PATCH 06/11] format --- dbms/src/Functions/tests/gtest_strings_unhex.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dbms/src/Functions/tests/gtest_strings_unhex.cpp b/dbms/src/Functions/tests/gtest_strings_unhex.cpp index 439c5f0bab8..fab052ff7bd 100644 --- a/dbms/src/Functions/tests/gtest_strings_unhex.cpp +++ b/dbms/src/Functions/tests/gtest_strings_unhex.cpp @@ -100,8 +100,7 @@ try executeFunction( func_name, createColumn>({61, 9, std::nullopt}))); - } CATCH } // namespace tests -} // namespace DB \ No newline at end of file +} // namespace DB From 148d7fb0f108708ad8899121dac0a0e7195c4817 Mon Sep 17 00:00:00 2001 From: baishen Date: Sun, 20 Nov 2022 00:04:08 +0800 Subject: [PATCH 07/11] tidy --- dbms/src/Functions/FunctionsString.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index bdd291bea13..cfbe7225168 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -6013,7 +6013,7 @@ class FunctionTiDBUnHex : public IFunction ColumnString::Offsets & res_offsets, ColumnUInt8::Container & res_null_map) { - const auto col = checkAndGetColumn(column.get()); + const auto * const col = checkAndGetColumn(column.get()); if (col == nullptr) { return false; @@ -6044,7 +6044,7 @@ class FunctionTiDBUnHex : public IFunction ColumnString::Offsets & res_offsets, ColumnUInt8::Container & res_null_map) { - const auto col = checkAndGetColumn(column.get()); + const auto * const col = checkAndGetColumn(column.get()); if (col == nullptr) { return false; From 2acc88d3dc56265472f251bc81cdd03e319550f1 Mon Sep 17 00:00:00 2001 From: baishen Date: Mon, 21 Nov 2022 23:08:52 +0800 Subject: [PATCH 08/11] fix --- dbms/src/Functions/FunctionsString.cpp | 60 ++++--------------- .../Functions/tests/gtest_strings_unhex.cpp | 9 ++- 2 files changed, 19 insertions(+), 50 deletions(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index cfbe7225168..779a094997a 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -5973,7 +5973,7 @@ class FunctionTiDBUnHex : public IFunction DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (!arguments[0]->isStringOrFixedString() && !arguments[0]->isNumber()) + if (!arguments[0]->isString() && !arguments[0]->isNumber()) throw Exception( fmt::format("Illegal type {} of first argument of function {}", arguments[0]->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); @@ -5989,15 +5989,14 @@ class FunctionTiDBUnHex : public IFunction auto result_null_map = ColumnUInt8::create(size, 0); if (executeUnHexString(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) - || executeUnHexFixedString(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) - || executeUnHexInt(column, true, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) - || executeUnHexInt(column, true, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) - || executeUnHexInt(column, true, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) - || executeUnHexInt(column, true, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) - || executeUnHexInt(column, false, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) - || executeUnHexInt(column, false, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) - || executeUnHexInt(column, false, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) - || executeUnHexInt(column, false, col_res->getChars(), col_res->getOffsets(), result_null_map->getData())) + || executeUnHexInt(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) + || executeUnHexInt(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) + || executeUnHexInt(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) + || executeUnHexInt(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) + || executeUnHexInt(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) + || executeUnHexInt(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) + || executeUnHexInt(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) + || executeUnHexInt(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData())) { block.getByPosition(result).column = ColumnNullable::create(std::move(col_res), std::move(result_null_map)); } @@ -6039,46 +6038,9 @@ class FunctionTiDBUnHex : public IFunction return true; } - static bool executeUnHexFixedString(const ColumnPtr & column, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets, - ColumnUInt8::Container & res_null_map) - { - const auto * const col = checkAndGetColumn(column.get()); - if (col == nullptr) - { - return false; - } - const size_t size = col->size(); - const ColumnString::Chars_t & data = col->getChars(); - const size_t length = col->getN(); - - if (length % 2 != 0) - { - res_data.resize(length / 2 + 1); - } - else - { - res_data.resize(length / 2); - } - res_offsets.resize(size); - - ColumnString::Offset pos = 0; - for (size_t i = 0; i < size; ++i) - { - size_t begin = i * length; - unhexOne(data, length, i, begin, pos, res_data, res_offsets, res_null_map); - pos = res_offsets[i]; - } - res_data.resize(pos); - - return true; - } - - template + template static bool executeUnHexInt( const ColumnPtr & column, - const bool is_unsigned, ColumnString::Chars_t & res_chars, ColumnString::Offsets & res_offsets, ColumnUInt8::Container & res_null_map) @@ -6099,7 +6061,7 @@ class FunctionTiDBUnHex : public IFunction size_t pos = 0; for (size_t i = 0; i < size; ++i) { - if (is_unsigned) + if constexpr (is_unsigned) { UInt64 number = col->getUInt(i); length = sprintf(data, "%lu", number); diff --git a/dbms/src/Functions/tests/gtest_strings_unhex.cpp b/dbms/src/Functions/tests/gtest_strings_unhex.cpp index fab052ff7bd..95955129851 100644 --- a/dbms/src/Functions/tests/gtest_strings_unhex.cpp +++ b/dbms/src/Functions/tests/gtest_strings_unhex.cpp @@ -28,7 +28,7 @@ class UnHexTest : public DB::tests::FunctionTest { }; -TEST_F(UnHexTest, unhex_all_unit_Test) +TEST_F(UnHexTest, unhexAllUnitTest) try { const String & func_name = "tidbUnHex"; @@ -53,6 +53,13 @@ try func_name, createColumn>({"9", "09", "A", "0A", "20"}))); + // Const Column + ASSERT_COLUMN_EQ( + createConstColumn>(4, "ab"), + executeFunction( + func_name, + createConstColumn(4, "6162"))); + ASSERT_COLUMN_EQ( createColumn>({"abcd", "\tg", std::nullopt, std::nullopt}), executeFunction( From 5bcca8023668bccfceeb07f561b40d439595ecbd Mon Sep 17 00:00:00 2001 From: baishen Date: Tue, 22 Nov 2022 11:12:10 +0800 Subject: [PATCH 09/11] remove int implement --- dbms/src/Functions/FunctionsString.cpp | 77 +------------------ .../Functions/tests/gtest_strings_unhex.cpp | 48 ------------ 2 files changed, 1 insertion(+), 124 deletions(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index 779a094997a..7435b9a9973 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -5988,15 +5988,7 @@ class FunctionTiDBUnHex : public IFunction auto col_res = ColumnString::create(); auto result_null_map = ColumnUInt8::create(size, 0); - if (executeUnHexString(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) - || executeUnHexInt(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) - || executeUnHexInt(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) - || executeUnHexInt(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) - || executeUnHexInt(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) - || executeUnHexInt(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) - || executeUnHexInt(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) - || executeUnHexInt(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData()) - || executeUnHexInt(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData())) + if (executeUnHexString(column, col_res->getChars(), col_res->getOffsets(), result_null_map->getData())) { block.getByPosition(result).column = ColumnNullable::create(std::move(col_res), std::move(result_null_map)); } @@ -6038,73 +6030,6 @@ class FunctionTiDBUnHex : public IFunction return true; } - template - static bool executeUnHexInt( - const ColumnPtr & column, - ColumnString::Chars_t & res_chars, - ColumnString::Offsets & res_offsets, - ColumnUInt8::Container & res_null_map) - { - const auto col = checkAndGetColumn>(column.get()); - if (col == nullptr) - { - return false; - } - const size_t size = col->size(); - res_chars.resize(size * 10); - res_offsets.resize(size); - - char low; - char high; - char data[20]; - size_t length; - size_t pos = 0; - for (size_t i = 0; i < size; ++i) - { - if constexpr (is_unsigned) - { - UInt64 number = col->getUInt(i); - length = sprintf(data, "%lu", number); - } - else - { - Int64 number = col->getInt(i); - if (number < 0) - { - pos++; - res_offsets[i] = pos; - res_null_map[i] = 1; - continue; - } - length = sprintf(data, "%li", number); - } - - size_t begin = 0; - if (length % 2 != 0) - { - const char * byte = reinterpret_cast(&data[begin]); - fromHexChar(byte, low); - res_chars[pos] = low; - pos++; - begin++; - } - for (size_t j = begin; j < length; j += 2) - { - const char * byte1 = reinterpret_cast(&data[j]); - const char * byte2 = reinterpret_cast(&data[j + 1]); - fromHexChar(byte1, high); - fromHexChar(byte2, low); - res_chars[pos] = (high << 4) | low; - pos++; - } - pos++; - res_offsets[i] = pos; - } - res_chars.resize(pos); - - return true; - } - static void unhexOne(const ColumnString::Chars_t & data, const size_t length, const size_t idx, diff --git a/dbms/src/Functions/tests/gtest_strings_unhex.cpp b/dbms/src/Functions/tests/gtest_strings_unhex.cpp index 95955129851..e8e513c669a 100644 --- a/dbms/src/Functions/tests/gtest_strings_unhex.cpp +++ b/dbms/src/Functions/tests/gtest_strings_unhex.cpp @@ -59,54 +59,6 @@ try executeFunction( func_name, createConstColumn(4, "6162"))); - - ASSERT_COLUMN_EQ( - createColumn>({"abcd", "\tg", std::nullopt, std::nullopt}), - executeFunction( - func_name, - createColumn>({61626364, 967, std::nullopt, -1}))); - - ASSERT_COLUMN_EQ( - createColumn>({"abcd", "\tg", std::nullopt}), - executeFunction( - func_name, - createColumn>({61626364, 967, std::nullopt}))); - - ASSERT_COLUMN_EQ( - createColumn>({"abc", "\tg", std::nullopt, std::nullopt}), - executeFunction( - func_name, - createColumn>({616263, 967, std::nullopt, -1}))); - - ASSERT_COLUMN_EQ( - createColumn>({"abc", "\tg", std::nullopt}), - executeFunction( - func_name, - createColumn>({616263, 967, std::nullopt}))); - - ASSERT_COLUMN_EQ( - createColumn>({"ab", "\tg", std::nullopt, std::nullopt}), - executeFunction( - func_name, - createColumn>({6162, 967, std::nullopt, -1}))); - - ASSERT_COLUMN_EQ( - createColumn>({"ab", "\tg", std::nullopt}), - executeFunction( - func_name, - createColumn>({6162, 967, std::nullopt}))); - - ASSERT_COLUMN_EQ( - createColumn>({"a", "\t", std::nullopt, std::nullopt}), - executeFunction( - func_name, - createColumn>({61, 9, std::nullopt, -1}))); - - ASSERT_COLUMN_EQ( - createColumn>({"a", "\t", std::nullopt}), - executeFunction( - func_name, - createColumn>({61, 9, std::nullopt}))); } CATCH } // namespace tests From 505d0950fc5a0d24f74225ddb55478a79758859b Mon Sep 17 00:00:00 2001 From: baishen Date: Tue, 29 Nov 2022 15:44:12 +0800 Subject: [PATCH 10/11] fix --- dbms/src/Functions/FunctionsString.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index 7435b9a9973..e6f3e1c680f 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -5973,7 +5973,7 @@ class FunctionTiDBUnHex : public IFunction DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (!arguments[0]->isString() && !arguments[0]->isNumber()) + if (!arguments[0]->isString()) { throw Exception( fmt::format("Illegal type {} of first argument of function {}", arguments[0]->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); From 857819beea5335c0b01d121efad938f2b424de7a Mon Sep 17 00:00:00 2001 From: baishen Date: Fri, 2 Dec 2022 14:02:11 +0800 Subject: [PATCH 11/11] add tests --- dbms/src/Functions/FunctionsString.cpp | 2 +- dbms/src/Functions/tests/gtest_strings_unhex.cpp | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/dbms/src/Functions/FunctionsString.cpp b/dbms/src/Functions/FunctionsString.cpp index e6f3e1c680f..591c9cd2ddd 100644 --- a/dbms/src/Functions/FunctionsString.cpp +++ b/dbms/src/Functions/FunctionsString.cpp @@ -5973,7 +5973,7 @@ class FunctionTiDBUnHex : public IFunction DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (!arguments[0]->isString()) { + if (!arguments[0]->isString()) throw Exception( fmt::format("Illegal type {} of first argument of function {}", arguments[0]->getName(), getName()), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); diff --git a/dbms/src/Functions/tests/gtest_strings_unhex.cpp b/dbms/src/Functions/tests/gtest_strings_unhex.cpp index e8e513c669a..7ff65ba801f 100644 --- a/dbms/src/Functions/tests/gtest_strings_unhex.cpp +++ b/dbms/src/Functions/tests/gtest_strings_unhex.cpp @@ -39,6 +39,16 @@ try func_name, createColumn>({"7777772E70696E676361702E636F6D", "61626364", std::nullopt, "GG", ""}))); + ASSERT_COLUMN_EQ( + createColumn>({"ѐёђѓєѕіїјЉЊЋЌЍЎЏ", "+ѐ-ё*ђ/ѓ!є@ѕ#і$@ї%ј……Љ&Њ(Ћ)Ќ¥Ѝ#Ў@Џ!^", "αβγδεζηθικλμνξοπρστυφχψως", "▲α▼βγ➨δε☎ζη✂θι€κλ♫μν✓ξο✚πρ℉στ♥υφ♖χψ♘ω★ς✕", "թփձջրչճժծքոեռտըւիօպասդֆգհյկլխզղցվբնմշ"}), + executeFunction( + func_name, + createColumn({"d190d191d192d193d194d195d196d197d198d089d08ad08bd08cd08dd08ed08f", + "2bd1902dd1912ad1922fd19321d19440d19523d1962440d19725d198e280a6e280a6d08926d08aefbc88d08befbc89d08cefbfa5d08d23d08e40d08fefbc815e", + "ceb1ceb2ceb3ceb4ceb5ceb6ceb7ceb8ceb9cebacebbcebccebdcebecebfcf80cf81cf83cf84cf85cf86cf87cf88cf89cf82", + "e296b2ceb1e296bcceb2ceb3e29ea8ceb4ceb5e2988eceb6ceb7e29c82ceb8ceb9e282accebacebbe299abcebccebde29c93cebecebfe29c9acf80cf81e28489cf83cf84e299a5cf85cf86e29996cf87cf88e29998cf89e29885cf82e29c95", + "d5a9d683d5b1d5bbd680d5b9d5b3d5aad5aed684d5b8d5a5d5bcd5bfd5a8d682d5abd685d5bad5a1d5bdd5a4d686d5a3d5b0d5b5d5afd5acd5add5a6d5b2d681d5bed5a2d5b6d5b4d5b7"}))); + // CJK and emoji ASSERT_COLUMN_EQ( createColumn>({"さらに入", "测试测试测试测试abcd测试", "🍻", "🏴‍☠️"}),