Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions be/src/util/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,4 @@ inline int128_t get_int128_from_unalign(const void* address) {
return value;
}

/// Reads sizeof(uint128_t) bytes starting at `address` into a uint128_t and returns it.
/// The bytes are copied with memcpy instead of dereferencing a casted pointer.
inline uint128_t get_uint128_from_unalign(const void* address) {
    uint128_t loaded = 0;
    memcpy(&loaded, address, sizeof(loaded));
    return loaded;
}
} // namespace doris
162 changes: 161 additions & 1 deletion be/src/vec/common/format_ip.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#pragma once

#include <vec/common/hex.h>
#include <vec/common/string_utils/string_utils.h>

#include <algorithm>
Expand All @@ -34,7 +35,7 @@ constexpr size_t IPV4_MAX_TEXT_LENGTH = 15; /// Does not count tail zero byte.
constexpr size_t IPV6_MAX_TEXT_LENGTH = 39;
constexpr size_t IPV4_MIN_NUM_VALUE = 0; //num value of '0.0.0.0'
constexpr size_t IPV4_MAX_NUM_VALUE = 4294967295; //num value of '255.255.255.255'
constexpr int IPV4_MAX_OCTET_VALUE = 255; //max vulue of octet
constexpr int IPV4_MAX_OCTET_VALUE = 255; //max value of octet
constexpr size_t IPV4_OCTET_BITS = 8;
constexpr size_t DECIMAL_BASE = 10;
constexpr size_t IPV6_BINARY_LENGTH = 16;
Expand Down Expand Up @@ -198,4 +199,163 @@ inline bool parseIPv4whole(const char* src, unsigned char* dst) {
*/
void formatIPv6(const unsigned char* src, char*& dst, uint8_t zeroed_tail_bytes_count = 0);

/** Unsafe (no bounds-checking for src nor dst), optimized version of parsing IPv6 string.
*
* Parses the input string `src` and stores binary big-endian value into buffer pointed by `dst`,
* which should be long enough. In case of failure zeroes IPV6_BINARY_LENGTH bytes of buffer pointed by `dst`.
*
* WARNING - this function is adapted to work with ReadBuffer, where src is the position reference (ReadBuffer::position())
* and eof is the ReadBuffer::eof() - therefore algorithm below does not rely on buffer's continuity.
* To parse strings use overloads below.
*
* `src` is advanced as characters are consumed, on success and on failure alike.
* The "trailing colon" rejection only triggers when eof() reports the end right after a ':';
* a caller whose eof() never returns true does not get that check.
*
* @param src - iterator (reference to pointer) over input string - warning - continuity is not guaranteed.
* @param eof - function returning true if iterator reached the end - warning - can break iterator's continuity.
* @param dst - where to put output bytes, expected to be non-null and at least IPV6_BINARY_LENGTH bytes long.
* @param first_block - preparsed first block; a negative value (the default) means there is none.
* @return - true if parsed successfully, false otherwise.
*/
template <typename T, typename EOFfunction>
requires(std::is_same<typename std::remove_cv<T>::type, char>::value)
inline bool parseIPv6(T*& src, EOFfunction eof, unsigned char* dst, int32_t first_block = -1) {
/// Zeroes the output buffer and yields false, so every failure path can `return clear_dst();`
const auto clear_dst = [dst]() {
std::memset(dst, '\0', IPV6_BINARY_LENGTH);
return false;
};

if (src == nullptr || eof()) return clear_dst();

int groups = 0; /// number of parsed groups
unsigned char* iter = dst; /// iterator over dst buffer
unsigned char* zptr =
nullptr; /// pointer into dst buffer array where all-zeroes block ("::") is started

std::memset(dst, '\0', IPV6_BINARY_LENGTH);

/// A preparsed first group is stored as two big-endian bytes before the loop starts.
if (first_block >= 0) {
*iter++ = static_cast<unsigned char>((first_block >> 8) & 0xffu);
*iter++ = static_cast<unsigned char>(first_block & 0xffu);
/// a ':' at the current position marks the all-zeroes block as starting right after the preparsed group
if (*src == ':') {
zptr = iter;
++src;
}
++groups;
}

/// true while a new group may begin at the current position (at the start and right after a ':')
bool group_start = true;

while (!eof() && groups < 8) {
if (*src == ':') {
++src;
if (eof()) /// trailing colon is not allowed
return clear_dst();

group_start = true;

if (*src == ':') {
if (zptr != nullptr) /// multiple all-zeroes blocks are not allowed
return clear_dst();
zptr = iter;
++src;
continue;
}
if (groups == 0) /// leading colon is not allowed
return clear_dst();
}

if (*src == '.') /// mixed IPv4 parsing
{
if (groups <= 1 && zptr == nullptr) /// IPv4 block can't be the first
return clear_dst();

if (group_start) /// first octet of IPv4 should be already parsed as an IPv6 group
return clear_dst();

++src;
if (eof()) return clear_dst();

/// last parsed group should be reinterpreted as a decimal value - it's the first octet of IPv4
--groups;
iter -= 2;

/// Each stored nibble is read back as one decimal digit; a nibble above 9 cannot be a decimal digit.
UInt16 num = 0;
for (int i = 0; i < 2; ++i) {
unsigned char first = (iter[i] >> 4) & 0x0fu;
unsigned char second = iter[i] & 0x0fu;
if (first > 9 || second > 9) return clear_dst();
(num *= 100) += first * 10 + second;
}
if (num > 255) return clear_dst();

/// parse IPv4 with known first octet
/// NOTE(review): parseIPv4 is defined outside this view; this assumes it writes exactly
/// IPV4_BINARY_LENGTH bytes at iter - confirm, since iter may be only 4 bytes before the end of dst.
if (!parseIPv4(src, eof, iter, num)) return clear_dst();

/// the four bytes are reversed in place on little-endian hosts - presumably parseIPv4 stores
/// the value in host byte order while dst must be big-endian; verify against parseIPv4
if constexpr (std::endian::native == std::endian::little)
std::reverse(iter, iter + IPV4_BINARY_LENGTH);

/// the IPv4 block occupies four bytes, i.e. two IPv6 groups
iter += 4;
groups += 2;
break; /// IPv4 block is the last - end of parsing
}

if (!group_start) /// end of parsing
break;
group_start = false;

UInt16 val = 0; /// current decoded group
int xdigits = 0; /// number of decoded hex digits in current group

/// consume up to four hex digits; unhex() yielding 0xFF ends the group
for (; !eof() && xdigits < 4; ++src, ++xdigits) {
UInt8 num = unhex(*src);
if (num == 0xFF) break;
(val <<= 4) |= num;
}

if (xdigits == 0) /// end of parsing
break;

/// store the group as two big-endian bytes
*iter++ = static_cast<unsigned char>((val >> 8) & 0xffu);
*iter++ = static_cast<unsigned char>(val & 0xffu);
++groups;
}

/// either all 8 groups or all-zeroes block should be present
if (groups < 8 && zptr == nullptr) return clear_dst();

if (zptr != nullptr) /// process all-zeroes block
{
/// move the bytes written after "::" to the end of dst, then zero the gap left behind
size_t msize = iter - zptr;
std::memmove(dst + IPV6_BINARY_LENGTH - msize, zptr, msize);
std::memset(zptr, '\0', IPV6_BINARY_LENGTH - (iter - dst));
}

return true;
}

/// returns pointer to the right after parsed sequence or null on failed parsing
inline const char* parseIPv6(const char* src, const char* end, unsigned char* dst) {
if (parseIPv6(
src, [&src, end]() { return src == end; }, dst))
return src;
return nullptr;
}

/// returns true if whole buffer was parsed successfully
inline bool parseIPv6whole(const char* src, const char* end, unsigned char* dst) {
return parseIPv6(src, end, dst) == end;
}

/// Parses an IPv6 address from the null-terminated string `src` into `dst`.
///
/// @param src - null-terminated input string.
/// @param dst - output buffer handed to the generic parseIPv6.
/// @return - pointer to the right after parsed sequence, or null on failed parsing.
inline const char* parseIPv6(const char* src, unsigned char* dst) {
    /// The terminating '\0' is reported as end of input. With a predicate that never returns
    /// true, the generic parser's end-of-input checks (such as "trailing colon is not allowed")
    /// could never trigger, so e.g. "1::2:" was accepted here but rejected by the ranged overload.
    if (parseIPv6(
                src, [&src]() { return *src == '\0'; }, dst)) {
        return src;
    }
    return nullptr;
}

/// Reports whether the entire null-terminated string `src` is one IPv6 address:
/// parsing must succeed and must stop exactly at the terminating '\0'.
inline bool parseIPv6whole(const char* src, unsigned char* dst) {
    if (const char* tail = parseIPv6(src, dst); tail != nullptr) {
        return *tail == '\0';
    }
    return false;
}

} // namespace doris::vectorized
89 changes: 21 additions & 68 deletions be/src/vec/runtime/ipv4_value.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,13 @@

#pragma once

#include <stdint.h>

#include <algorithm>
#include <regex>
#include <sstream>
#include <string>

#include "util/string_parser.hpp"
#include "vec/common/format_ip.h"

namespace doris {

Expand All @@ -34,87 +33,41 @@ class IPv4Value {

explicit IPv4Value(vectorized::IPv4 ipv4) { _value = ipv4; }

explicit IPv4Value(std::string ipv4) {}

[[nodiscard]] const vectorized::IPv4& value() const { return _value; }
const vectorized::IPv4& value() const { return _value; }

vectorized::IPv4& value() { return _value; }

void set_value(vectorized::IPv4 ipv4) { _value = ipv4; }

bool from_string(std::string ipv4) { return from_string(_value, ipv4); }

[[nodiscard]] std::string to_string() const { return to_string(_value); }
bool from_string(const std::string& ipv4_str) { return from_string(_value, ipv4_str); }

static bool from_string(vectorized::IPv4& value, std::string ipv4) {
remove_ipv4_space(ipv4);
std::string to_string() const { return to_string(_value); }

// shortest ipv4 string is `0.0.0.0` whose length is 7
if (ipv4.size() < 7 || !is_valid_string(ipv4)) {
static bool from_string(vectorized::IPv4& value, const std::string& ipv4_str) {
if (ipv4_str.empty()) {
return false;
}

vectorized::IPv4 octets[4] = {0};
std::istringstream iss(ipv4);
std::string octet;
uint8_t octet_index = 0;

while (getline(iss, octet, '.')) {
if (octet_index >= 4) {
return false;
}

StringParser::ParseResult result;
vectorized::IPv4 val = StringParser::string_to_unsigned_int<vectorized::IPv4>(
octet.c_str(), octet.length(), &result);
if (result != StringParser::PARSE_SUCCESS || val > 255) {
return false;
}

octets[octet_index++] = val;
}

if (octet_index != 4) {
int64_t parse_value;
const char* src = ipv4_str.c_str();
const char* end = ipv4_str.c_str() + ipv4_str.size() - 1;
while (std::isspace(*src)) ++src;
while (std::isspace(*end)) --end;
if (!vectorized::parseIPv4whole(src, ++end,
reinterpret_cast<unsigned char*>(&parse_value))) {
return false;
}

value = (octets[0] << 24) | (octets[1] << 16) | (octets[2] << 8) | octets[3];
value = static_cast<vectorized::IPv4>(parse_value);
return true;
}

static std::string to_string(vectorized::IPv4 value) {
std::stringstream ss;
ss << ((value >> 24) & 0xFF) << '.' << ((value >> 16) & 0xFF) << '.'
<< ((value >> 8) & 0xFF) << '.' << (value & 0xFF);
return ss.str();
}

/// Strips leading and trailing '\r', '\n', '\t' and ' ' characters from `ipv4` in place.
/// A string consisting only of those characters becomes empty.
///
/// @param ipv4 - string to trim; modified in place.
static void remove_ipv4_space(std::string& ipv4) {
    constexpr const char* whitespace = "\r\n\t ";

    const std::string::size_type first = ipv4.find_first_not_of(whitespace);
    if (first == std::string::npos) {
        /// empty or whitespace-only: nothing is worth keeping
        ipv4.clear();
        return;
    }

    /// a non-whitespace character exists, so this search cannot fail
    const std::string::size_type last = ipv4.find_last_not_of(whitespace);
    ipv4.erase(last + 1);
    ipv4.erase(0, first);
}

static bool is_valid_string(std::string ipv4) {
static std::regex IPV4_STD_REGEX(
"^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-"
"9]?)$");
if (ipv4.size() > 15 || !std::regex_match(ipv4, IPV4_STD_REGEX)) {
return false;
}
return true;
char buf[IPV4_MAX_TEXT_LENGTH + 1];
char* start = buf;
char* end = buf;
const auto* src = reinterpret_cast<const unsigned char*>(&value);
vectorized::formatIPv4(src, end);
size_t len = end - start;
return {buf, len};
}

private:
Expand Down
Loading