Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 126 additions & 1 deletion be/src/vec/functions/url/domain.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@

#pragma once

// #include <base/find_symbols.h>
#include <cstring>

#include "vec/common/string_utils/string_utils.h"
#include "vec/functions/url/find_symbols.h"
#include "vec/functions/url/protocol.h"
#include "vec/functions/url/tldLookup.h"

namespace doris::vectorized {

Expand Down Expand Up @@ -144,4 +145,128 @@ struct ExtractDomain {
}
};

/// Extracts the top-level domain (the text after the last '.') of a URL's host.
struct ExtractTopLevelDomain {
    /// Per-element reserve-length hint; how it is consumed is decided by the caller.
    static size_t get_reserve_length_for_element() { return 5; }

    /// @param data      start of the URL bytes
    /// @param size      number of bytes in the URL
    /// @param res_data  receives a pointer to the first character of the TLD;
    ///                  set to `data` when nothing is selected
    /// @param res_size  receives the TLD length; 0 when nothing is selected
    ///
    /// Nothing is selected when the URL has no host, when the host contains no
    /// dot, or when the last label starts with an ASCII digit (IPv4 address).
    static void execute(const char* data, size_t size, const char*& res_data, size_t& res_size) {
        res_data = data;
        res_size = 0;

        StringRef host = get_url_host(data, size);
        if (host.size == 0) {
            return;
        }

        /// A single trailing dot ("example.com.") is not part of the TLD.
        auto trimmed = host.to_string_view();
        if (trimmed.back() == '.') {
            trimmed.remove_suffix(1);
        }

        const char* const host_begin = trimmed.data();
        const char* const host_end = host_begin + trimmed.size();

        const char* const last_dot = find_last_symbols_or_null<'.'>(host_begin, host_end);
        if (last_dot == nullptr) {
            return;
        }

        /// Reading the character after the last dot is always in bounds:
        /// either the trimmed host does not end with '.', so the last dot is not
        /// its final character, or a trailing '.' was trimmed above and is still
        /// present in the underlying buffer returned by get_url_host().
        const char* const tld_begin = last_dot + 1;

        /// For IPv4 addresses select nothing.
        if (is_numeric_ascii(*tld_begin)) {
            return;
        }

        res_data = tld_begin;
        res_size = host_end - tld_begin;
    }
};

/// Extracts the "first significant subdomain" of a URL: the label that sits
/// directly in front of the public suffix of the domain.
///
/// Selection rules, applied to the domain returned by ExtractDomain<true>
/// (with one trailing '.' ignored):
///   * no dot                -> the whole domain;
///   * exactly one dot       -> the label before that dot;
///   * two or more dots      -> if the text after the second-to-last dot is
///                              accepted by tldLookup::is_valid (a two-label
///                              suffix), the label before it; otherwise the
///                              label between the last two dots.
struct ExtractFirstSignificantSubdomain {
    /// Per-element reserve-length hint; how it is consumed is decided by the caller.
    static size_t get_reserve_length_for_element() { return 10; }

    /// @param data            start of the URL bytes
    /// @param size            number of bytes in the URL
    /// @param res_data        receives the start of the selected label;
    ///                        set to `data` when the URL has no domain
    /// @param res_size        receives the length of the selected label;
    ///                        0 when the URL has no domain
    /// @param out_domain_end  optional; when non-null and the domain is not
    ///                        empty, receives the end of the domain as reported
    ///                        by ExtractDomain (a trailing '.' is still included).
    ///                        It is left untouched when the domain is empty.
    static void execute(const Pos data, const size_t size, Pos& res_data, size_t& res_size,
                        Pos* out_domain_end = nullptr) {
        res_data = data;
        res_size = 0;

        Pos tmp = data;
        size_t domain_length = 0;
        ExtractDomain<true>::execute(data, size, tmp, domain_length);

        if (domain_length == 0) {
            return;
        }
        if (out_domain_end) {
            *out_domain_end = tmp + domain_length;
        }

        /// cut useless dot
        if (tmp[domain_length - 1] == '.') {
            --domain_length;
        }

        /// Default answer: the whole domain (used when it contains no dot).
        res_data = tmp;
        res_size = domain_length;

        const auto* begin = tmp;
        const auto* end = begin + domain_length;

        /// Positions of the last three dots: [0] is the last one, [1] the one
        /// before it, [2] the one before that. Entries stay null when the
        /// domain has fewer dots.
        std::array<const char*, 3> last_periods {};

        const auto* pos = find_first_symbols<'.'>(begin, end);
        while (pos < end) {
            last_periods[2] = last_periods[1];
            last_periods[1] = last_periods[0];
            last_periods[0] = pos;
            pos = find_first_symbols<'.'>(pos + 1, end);
        }

        if (!last_periods[0]) {
            return;
        }

        if (!last_periods[1]) {
            /// "name.tld": select "name".
            res_size = last_periods[0] - begin;
            return;
        }

        /// Start of the label that precedes the second-to-last dot. With only
        /// two dots that label starts at the beginning of the domain. This is
        /// computed directly instead of storing `begin - 1` as a fake dot
        /// position, because forming a pointer before the start of the buffer
        /// is undefined behaviour.
        const auto* third_label_begin = last_periods[2] ? last_periods[2] + 1 : begin;

        /// Kept from the original as a defensive check; the loop above relies on
        /// find_first_symbols returning `end` (not null) when nothing is found.
        const auto* end_of_level_domain = find_first_symbols<'/'>(last_periods[0], end);
        if (!end_of_level_domain) {
            end_of_level_domain = end;
        }

        /// Candidate two-label suffix: everything after the second-to-last dot.
        const char* suffix_begin = last_periods[1] + 1;
        const auto suffix_len = static_cast<size_t>(end_of_level_domain - suffix_begin);

        if (tldLookup::is_valid(suffix_begin, suffix_len)) {
            /// The suffix spans two labels, so the significant label is the one before it.
            res_data += third_label_begin - begin;
            res_size = last_periods[1] - third_label_begin;
        } else {
            /// Ordinary single-label TLD: take the label between the last two dots.
            res_data += suffix_begin - begin;
            res_size = last_periods[0] - suffix_begin;
        }
    }
};

struct CutToFirstSignificantSubdomain {
static size_t get_reserve_length_for_element() { return 15; }

static void execute(const Pos data, const size_t size, Pos& res_data, size_t& res_size) {
res_data = data;
res_size = 0;

Pos tmp_data = data;
size_t tmp_length;
Pos domain_end = data;
ExtractFirstSignificantSubdomain::execute(data, size, tmp_data, tmp_length, &domain_end);

if (tmp_length == 0) {
return;
}
res_data = tmp_data;
res_size = domain_end - tmp_data;
}
};
} // namespace doris::vectorized
Loading
Loading