From c520e8b633cebf5115b62e3539b4a25582729b0e Mon Sep 17 00:00:00 2001 From: sjyango Date: Mon, 25 Dec 2023 14:10:51 +0000 Subject: [PATCH 1/3] update ip parser --- be/src/util/types.h | 5 - be/src/vec/common/format_ip.h | 163 ++++++++++++++++++++++- be/src/vec/runtime/ipv4_value.h | 88 +++--------- be/src/vec/runtime/ipv6_value.h | 228 +++----------------------------- 4 files changed, 201 insertions(+), 283 deletions(-) diff --git a/be/src/util/types.h b/be/src/util/types.h index b04c0644f6fa7f..7688dd60390855 100644 --- a/be/src/util/types.h +++ b/be/src/util/types.h @@ -44,9 +44,4 @@ inline int128_t get_int128_from_unalign(const void* address) { return value; } -inline uint128_t get_uint128_from_unalign(const void* address) { - uint128_t value = 0; - memcpy(&value, address, sizeof(uint128_t)); - return value; -} } // namespace doris diff --git a/be/src/vec/common/format_ip.h b/be/src/vec/common/format_ip.h index ab9ac4b6595032..ff70500ea1d6f6 100644 --- a/be/src/vec/common/format_ip.h +++ b/be/src/vec/common/format_ip.h @@ -20,7 +20,9 @@ #pragma once +#include #include +#include #include #include @@ -34,7 +36,7 @@ constexpr size_t IPV4_MAX_TEXT_LENGTH = 15; /// Does not count tail zero byte. constexpr size_t IPV6_MAX_TEXT_LENGTH = 39; constexpr size_t IPV4_MIN_NUM_VALUE = 0; //num value of '0.0.0.0' constexpr size_t IPV4_MAX_NUM_VALUE = 4294967295; //num value of '255.255.255.255' -constexpr int IPV4_MAX_OCTET_VALUE = 255; //max vulue of octet +constexpr int IPV4_MAX_OCTET_VALUE = 255; //max value of octet constexpr size_t IPV4_OCTET_BITS = 8; constexpr size_t DECIMAL_BASE = 10; constexpr size_t IPV6_BINARY_LENGTH = 16; @@ -198,4 +200,163 @@ inline bool parseIPv4whole(const char* src, unsigned char* dst) { */ void formatIPv6(const unsigned char* src, char*& dst, uint8_t zeroed_tail_bytes_count = 0); +/** Unsafe (no bounds-checking for src nor dst), optimized version of parsing IPv6 string. 
+* +* Parses the input string `src` and stores binary big-endian value into buffer pointed by `dst`, +* which should be long enough. In case of failure zeroes IPV6_BINARY_LENGTH bytes of buffer pointed by `dst`. +* +* WARNING - this function is adapted to work with ReadBuffer, where src is the position reference (ReadBuffer::position()) +* and eof is the ReadBuffer::eof() - therefore algorithm below does not rely on buffer's continuity. +* To parse strings use overloads below. +* +* @param src - iterator (reference to pointer) over input string - warning - continuity is not guaranteed. +* @param eof - function returning true if iterator reached the end - warning - can break iterator's continuity. +* @param dst - where to put output bytes, expected to be non-null and at least IPV6_BINARY_LENGTH bytes long. +* @param first_block - preparsed first block +* @return - true if parsed successfully, false otherwise. +*/ +template +requires(std::is_same::type, char>::value) +inline bool parseIPv6(T*& src, EOFfunction eof, unsigned char* dst, int32_t first_block = -1) { + const auto clear_dst = [dst]() { + std::memset(dst, '\0', IPV6_BINARY_LENGTH); + return false; + }; + + if (src == nullptr || eof()) return clear_dst(); + + int groups = 0; /// number of parsed groups + unsigned char* iter = dst; /// iterator over dst buffer + unsigned char* zptr = + nullptr; /// pointer into dst buffer array where all-zeroes block ("::") is started + + std::memset(dst, '\0', IPV6_BINARY_LENGTH); + + if (first_block >= 0) { + *iter++ = static_cast((first_block >> 8) & 0xffu); + *iter++ = static_cast(first_block & 0xffu); + if (*src == ':') { + zptr = iter; + ++src; + } + ++groups; + } + + bool group_start = true; + + while (!eof() && groups < 8) { + if (*src == ':') { + ++src; + if (eof()) /// trailing colon is not allowed + return clear_dst(); + + group_start = true; + + if (*src == ':') { + if (zptr != nullptr) /// multiple all-zeroes blocks are not allowed + return clear_dst(); + zptr = iter; + 
++src; + continue; + } + if (groups == 0) /// leading colon is not allowed + return clear_dst(); + } + + if (*src == '.') /// mixed IPv4 parsing + { + if (groups <= 1 && zptr == nullptr) /// IPv4 block can't be the first + return clear_dst(); + + if (group_start) /// first octet of IPv4 should be already parsed as an IPv6 group + return clear_dst(); + + ++src; + if (eof()) return clear_dst(); + + /// last parsed group should be reinterpreted as a decimal value - it's the first octet of IPv4 + --groups; + iter -= 2; + + UInt16 num = 0; + for (int i = 0; i < 2; ++i) { + unsigned char first = (iter[i] >> 4) & 0x0fu; + unsigned char second = iter[i] & 0x0fu; + if (first > 9 || second > 9) return clear_dst(); + (num *= 100) += first * 10 + second; + } + if (num > 255) return clear_dst(); + + /// parse IPv4 with known first octet + if (!parseIPv4(src, eof, iter, num)) return clear_dst(); + + if constexpr (std::endian::native == std::endian::little) + std::reverse(iter, iter + IPV4_BINARY_LENGTH); + + iter += 4; + groups += 2; + break; /// IPv4 block is the last - end of parsing + } + + if (!group_start) /// end of parsing + break; + group_start = false; + + UInt16 val = 0; /// current decoded group + int xdigits = 0; /// number of decoded hex digits in current group + + for (; !eof() && xdigits < 4; ++src, ++xdigits) { + UInt8 num = unhex(*src); + if (num == 0xFF) break; + (val <<= 4) |= num; + } + + if (xdigits == 0) /// end of parsing + break; + + *iter++ = static_cast((val >> 8) & 0xffu); + *iter++ = static_cast(val & 0xffu); + ++groups; + } + + /// either all 8 groups or all-zeroes block should be present + if (groups < 8 && zptr == nullptr) return clear_dst(); + + if (zptr != nullptr) /// process all-zeroes block + { + size_t msize = iter - zptr; + std::memmove(dst + IPV6_BINARY_LENGTH - msize, zptr, msize); + std::memset(zptr, '\0', IPV6_BINARY_LENGTH - (iter - dst)); + } + + return true; +} + +/// returns pointer to the right after parsed sequence or null on 
failed parsing +inline const char* parseIPv6(const char* src, const char* end, unsigned char* dst) { + if (parseIPv6( + src, [&src, end]() { return src == end; }, dst)) + return src; + return nullptr; +} + +/// returns true if whole buffer was parsed successfully +inline bool parseIPv6whole(const char* src, const char* end, unsigned char* dst) { + return parseIPv6(src, end, dst) == end; +} + +/// returns pointer to the right after parsed sequence or null on failed parsing +inline const char* parseIPv6(const char* src, unsigned char* dst) { + if (parseIPv6( + src, []() { return false; }, dst)) + return src; + return nullptr; +} + +/// returns true if whole null-terminated string was parsed successfully +inline bool parseIPv6whole(const char* src, unsigned char* dst) { + const char* end = parseIPv6(src, dst); + return end != nullptr && *end == '\0'; +} + } // namespace doris::vectorized diff --git a/be/src/vec/runtime/ipv4_value.h b/be/src/vec/runtime/ipv4_value.h index fb42ed66f9ca5c..d8ea5a6af2b36e 100644 --- a/be/src/vec/runtime/ipv4_value.h +++ b/be/src/vec/runtime/ipv4_value.h @@ -17,14 +17,13 @@ #pragma once -#include - #include #include #include #include #include "util/string_parser.hpp" +#include "vec/common/format_ip.h" namespace doris { @@ -34,87 +33,40 @@ class IPv4Value { explicit IPv4Value(vectorized::IPv4 ipv4) { _value = ipv4; } - explicit IPv4Value(std::string ipv4) {} - - [[nodiscard]] const vectorized::IPv4& value() const { return _value; } + const vectorized::IPv4& value() const { return _value; } vectorized::IPv4& value() { return _value; } void set_value(vectorized::IPv4 ipv4) { _value = ipv4; } - bool from_string(std::string ipv4) { return from_string(_value, ipv4); } - - [[nodiscard]] std::string to_string() const { return to_string(_value); } + bool from_string(const std::string& ipv4_str) { return from_string(_value, ipv4_str); } - static bool from_string(vectorized::IPv4& value, std::string ipv4) { - remove_ipv4_space(ipv4); + std::string 
to_string() const { return to_string(_value); } - // shortest ipv4 string is `0.0.0.0` whose length is 7 - if (ipv4.size() < 7 || !is_valid_string(ipv4)) { + static bool from_string(vectorized::IPv4& value, const std::string& ipv4_str) { + if (ipv4_str.empty()) { return false; } - - vectorized::IPv4 octets[4] = {0}; - std::istringstream iss(ipv4); - std::string octet; - uint8_t octet_index = 0; - - while (getline(iss, octet, '.')) { - if (octet_index >= 4) { - return false; - } - - StringParser::ParseResult result; - vectorized::IPv4 val = StringParser::string_to_unsigned_int( - octet.c_str(), octet.length(), &result); - if (result != StringParser::PARSE_SUCCESS || val > 255) { - return false; - } - - octets[octet_index++] = val; - } - - if (octet_index != 4) { + int64_t parse_value; + const char* src = ipv4_str.c_str(); + const char* end = ipv4_str.c_str() + ipv4_str.size() - 1; + while (std::isspace(*src)) ++src; + while (std::isspace(*end)) --end; + if (!vectorized::parseIPv4whole(src, ++end, reinterpret_cast(&parse_value))) { return false; } - - value = (octets[0] << 24) | (octets[1] << 16) | (octets[2] << 8) | octets[3]; + value = static_cast(parse_value); return true; } static std::string to_string(vectorized::IPv4 value) { - std::stringstream ss; - ss << ((value >> 24) & 0xFF) << '.' << ((value >> 16) & 0xFF) << '.' - << ((value >> 8) & 0xFF) << '.' 
<< (value & 0xFF); - return ss.str(); - } - - static void remove_ipv4_space(std::string& ipv4) { - if (ipv4.empty()) { - return; - } - - std::string special_chars = "\r\n\t "; - - size_t pos = ipv4.find_first_not_of(special_chars); - if (pos != std::string::npos) { - ipv4.erase(0, pos); - } - - pos = ipv4.find_last_not_of(special_chars); - if (pos != std::string::npos) { - ipv4.erase(pos + 1); - } - } - - static bool is_valid_string(std::string ipv4) { - static std::regex IPV4_STD_REGEX( - "^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-" - "9]?)$"); - if (ipv4.size() > 15 || !std::regex_match(ipv4, IPV4_STD_REGEX)) { - return false; - } - return true; + char buf[IPV4_MAX_TEXT_LENGTH + 1]; + char* start = buf; + char* end = buf; + const auto *src = reinterpret_cast(&value); + vectorized::formatIPv4(src, end); + size_t len = end - start; + return {buf, len}; } private: diff --git a/be/src/vec/runtime/ipv6_value.h b/be/src/vec/runtime/ipv6_value.h index 8839ab4e2b06d2..8aaa8a26b699c7 100644 --- a/be/src/vec/runtime/ipv6_value.h +++ b/be/src/vec/runtime/ipv6_value.h @@ -17,12 +17,11 @@ #pragma once -#include - #include #include #include +#include "vec/common/format_ip.h" #include "vec/core/types.h" #include "vec/data_types/data_type.h" #include "vec/data_types/data_type_number_base.h" @@ -35,224 +34,35 @@ class IPv6Value { explicit IPv6Value(vectorized::IPv6 ipv6) { _value = ipv6; } - [[nodiscard]] const vectorized::IPv6& value() const { return _value; } + const vectorized::IPv6& value() const { return _value; } vectorized::IPv6& value() { return _value; } void set_value(vectorized::IPv6 ipv6) { _value = ipv6; } - bool from_string(std::string ipv6) { return from_string(_value, ipv6); } - - bool from_binary_string(std::string ipv6_binary) { - return from_binary_string(_value, ipv6_binary); - } - - static bool from_string(vectorized::IPv6& x, std::string ipv6) { - remove_ipv6_space(ipv6); - - if (ipv6.empty() || !is_valid_string(ipv6)) 
{ - return false; - } - - std::transform(ipv6.begin(), ipv6.end(), ipv6.begin(), - [](unsigned char ch) { return std::tolower(ch); }); - std::istringstream iss(ipv6); - std::string field; - uint16_t fields[8] = {0}; - uint8_t zero_index = 0; - uint8_t num_field = 0; - uint8_t right_field_num = 0; - - while (num_field < 8) { - if (!getline(iss, field, ':')) { - break; - } - - if (field.empty()) { - zero_index = num_field; - fields[num_field++] = 0; - } else { - try { - if (field.size() > 4 || field > "ffff") { - return false; - } - - fields[num_field++] = std::stoi(field, nullptr, 16); - } catch (const std::exception& /*e*/) { - return false; - } - } - } - - if (zero_index != 0) { - right_field_num = num_field - zero_index - 1; - - for (uint8_t i = 7; i > 7 - right_field_num; --i) { - fields[i] = fields[zero_index + right_field_num + i - 7]; - fields[zero_index + right_field_num + i - 7] = 0; - } - } - - uint64_t high = (static_cast(fields[0]) << 48) | - (static_cast(fields[1]) << 32) | - (static_cast(fields[2]) << 16) | static_cast(fields[3]); - uint64_t low = (static_cast(fields[4]) << 48) | - (static_cast(fields[5]) << 32) | - (static_cast(fields[6]) << 16) | static_cast(fields[7]); - - x = static_cast(high) << 64 | low; - return true; - } + bool from_string(const std::string& ipv6_str) { return from_string(_value, ipv6_str); } - static bool from_binary_string(vectorized::IPv6& x, std::string ipv6_binary_str) { - // Accepts a FixedString(16) value containing the IPv6 address in binary format - if (ipv6_binary_str.size() != 16) { + static bool from_string(vectorized::IPv6& value, const std::string& ipv6_str) { + if (ipv6_str.empty()) { return false; } - - uint64_t high = 0; - uint64_t low = 0; - - const uint8_t* ipv6_binary = reinterpret_cast(ipv6_binary_str.c_str()); - - for (int i = 0; i < 8; ++i) { - high |= (static_cast(ipv6_binary[i]) << (56 - i * 8)); - } - - for (int i = 8; i < 16; ++i) { - low |= (static_cast(ipv6_binary[i]) << (56 - (i - 8) * 8)); - } - - 
x = static_cast(high) << 64 | low; - return true; - } - - [[nodiscard]] std::string to_string() const { return to_string(_value); } - - static std::string to_string(vectorized::IPv6 x) { - // "0000:0000:0000:0000:0000:0000:0000:0000" - if (x == 0) { - return "::"; - } - - uint64_t low = static_cast(x); - uint64_t high = static_cast(x >> 64); - - uint16_t fields[8] = {static_cast((high >> 48) & 0xFFFF), - static_cast((high >> 32) & 0xFFFF), - static_cast((high >> 16) & 0xFFFF), - static_cast(high & 0xFFFF), - static_cast((low >> 48) & 0xFFFF), - static_cast((low >> 32) & 0xFFFF), - static_cast((low >> 16) & 0xFFFF), - static_cast(low & 0xFFFF)}; - - uint8_t zero_start = 0, zero_end = 0; - - while (zero_start < 8 && zero_end < 8) { - if (fields[zero_start] != 0) { - zero_start++; - zero_end = zero_start; - continue; - } - - while (zero_end < 7 && fields[zero_end + 1] == 0) { - zero_end++; - } - - if (zero_end > zero_start) { - break; - } - - zero_start++; - zero_end = zero_start; - } - - std::stringstream ss; - - if (zero_start == zero_end) { - for (uint8_t i = 0; i < 7; ++i) { - ss << std::hex << fields[i] << ":"; - } - ss << std::hex << fields[7]; - } else { - for (uint8_t i = 0; i < zero_start; ++i) { - ss << std::hex << fields[i] << ":"; - } - - if (zero_end == 7) { - ss << ":"; - } else { - for (uint8_t j = zero_end + 1; j < 8; ++j) { - ss << std::hex << ":" << fields[j]; - } - } - } - - return ss.str(); - } - - [[nodiscard]] std::string to_binary_string() const { return to_binary_string(_value); } - - static std::string to_binary_string(vectorized::IPv6 x) { - uint64_t low = static_cast(x); - uint64_t high = static_cast(x >> 64); - - uint8_t fields[16] = {static_cast((high >> 56) & 0xFF), - static_cast((high >> 48) & 0xFF), - static_cast((high >> 40) & 0xFF), - static_cast((high >> 32) & 0xFF), - static_cast((high >> 24) & 0xFF), - static_cast((high >> 16) & 0xFF), - static_cast((high >> 8) & 0xFF), - static_cast(high & 0xFF), - static_cast((low >> 56) & 0xFF), 
- static_cast((low >> 48) & 0xFF), - static_cast((low >> 40) & 0xFF), - static_cast((low >> 32) & 0xFF), - static_cast((low >> 24) & 0xFF), - static_cast((low >> 16) & 0xFF), - static_cast((low >> 8) & 0xFF), - static_cast(low & 0xFF)}; - - std::stringstream ss; - - for (int i = 0; i < 16; ++i) { - ss << (char)fields[i]; - } - - return ss.str(); + const char* src = ipv6_str.c_str(); + const char* end = ipv6_str.c_str() + ipv6_str.size() - 1; + while (std::isspace(*src)) ++src; + while (std::isspace(*end)) --end; + return vectorized::parseIPv6whole(src, ++end, reinterpret_cast(&value)); } - static void remove_ipv6_space(std::string& ipv6) { - if (ipv6.empty()) { - return; - } - - std::string special_chars = "\r\n\t "; + std::string to_string() const { return to_string(_value); } - size_t pos = ipv6.find_first_not_of(special_chars); - if (pos != std::string::npos) { - ipv6.erase(0, pos); - } - - pos = ipv6.find_last_not_of(special_chars); - if (pos != std::string::npos) { - ipv6.erase(pos + 1); - } - } - - static bool is_valid_string(std::string ipv6) { - static std::regex IPV6_STD_REGEX("^([0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}$"); - static std::regex IPV6_COMPRESS_REGEX( - "^(([0-9A-Fa-f]{1,4}(:[0-9A-Fa-f]{1,4})*)?)::((([0-9A-Fa-f]{1,4}:)*[0-9A-Fa-f]{1,4}" - ")?)$"); - - if (ipv6.size() > 39 || !(std::regex_match(ipv6, IPV6_STD_REGEX) || - std::regex_match(ipv6, IPV6_COMPRESS_REGEX))) { - return false; - } - return true; + static std::string to_string(vectorized::IPv6 value) { + char buf[IPV6_MAX_TEXT_LENGTH + 1]; + char* start = buf; + char* end = buf; + const auto* src = reinterpret_cast(&value); + vectorized::formatIPv6(src, end); + size_t len = end - start; + return {buf, len}; } private: From bc0c2a59982125ac46c79302b58b71149ac6d65b Mon Sep 17 00:00:00 2001 From: sjyango Date: Tue, 26 Dec 2023 03:52:39 +0000 Subject: [PATCH 2/3] fmt --- be/src/vec/common/format_ip.h | 6 +++--- be/src/vec/runtime/ipv4_value.h | 5 +++-- 2 files changed, 6 insertions(+), 5 
deletions(-) diff --git a/be/src/vec/common/format_ip.h b/be/src/vec/common/format_ip.h index ff70500ea1d6f6..71c7c73b50ad95 100644 --- a/be/src/vec/common/format_ip.h +++ b/be/src/vec/common/format_ip.h @@ -216,7 +216,7 @@ void formatIPv6(const unsigned char* src, char*& dst, uint8_t zeroed_tail_bytes_ * @return - true if parsed successfully, false otherwise. */ template -requires(std::is_same::type, char>::value) + requires(std::is_same::type, char>::value) inline bool parseIPv6(T*& src, EOFfunction eof, unsigned char* dst, int32_t first_block = -1) { const auto clear_dst = [dst]() { std::memset(dst, '\0', IPV6_BINARY_LENGTH); @@ -335,7 +335,7 @@ inline bool parseIPv6(T*& src, EOFfunction eof, unsigned char* dst, int32_t firs /// returns pointer to the right after parsed sequence or null on failed parsing inline const char* parseIPv6(const char* src, const char* end, unsigned char* dst) { if (parseIPv6( - src, [&src, end]() { return src == end; }, dst)) + src, [&src, end]() { return src == end; }, dst)) return src; return nullptr; } @@ -348,7 +348,7 @@ inline bool parseIPv6whole(const char* src, const char* end, unsigned char* dst) /// returns pointer to the right after parsed sequence or null on failed parsing inline const char* parseIPv6(const char* src, unsigned char* dst) { if (parseIPv6( - src, []() { return false; }, dst)) + src, []() { return false; }, dst)) return src; return nullptr; } diff --git a/be/src/vec/runtime/ipv4_value.h b/be/src/vec/runtime/ipv4_value.h index d8ea5a6af2b36e..9304aa2c18a846 100644 --- a/be/src/vec/runtime/ipv4_value.h +++ b/be/src/vec/runtime/ipv4_value.h @@ -52,7 +52,8 @@ class IPv4Value { const char* end = ipv4_str.c_str() + ipv4_str.size() - 1; while (std::isspace(*src)) ++src; while (std::isspace(*end)) --end; - if (!vectorized::parseIPv4whole(src, ++end, reinterpret_cast(&parse_value))) { + if (!vectorized::parseIPv4whole(src, ++end, + reinterpret_cast(&parse_value))) { return false; } value = static_cast(parse_value); @@ 
-63,7 +64,7 @@ class IPv4Value { char buf[IPV4_MAX_TEXT_LENGTH + 1]; char* start = buf; char* end = buf; - const auto *src = reinterpret_cast(&value); + const auto* src = reinterpret_cast(&value); vectorized::formatIPv4(src, end); size_t len = end - start; return {buf, len}; From 0ae016e23ca8807c1363f2a2380cece3dca701b8 Mon Sep 17 00:00:00 2001 From: sjyango Date: Tue, 26 Dec 2023 09:32:41 +0000 Subject: [PATCH 3/3] remove unused header --- be/src/vec/common/format_ip.h | 1 - 1 file changed, 1 deletion(-) diff --git a/be/src/vec/common/format_ip.h b/be/src/vec/common/format_ip.h index 71c7c73b50ad95..58322ecd9fc5f7 100644 --- a/be/src/vec/common/format_ip.h +++ b/be/src/vec/common/format_ip.h @@ -22,7 +22,6 @@ #include #include -#include #include #include