Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 43 additions & 21 deletions be/src/vec/functions/regexps.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,21 @@
#pragma once

#include <hs/hs.h>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: 'hs/hs.h' file not found [clang-diagnostic-error]

#include <hs/hs.h>
         ^

#include <hs/hs_common.h>

#include <boost/container_hash/hash.hpp>
#include <map>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <utility>
#include <vector>

#include "common/exception.h"
#include "common/status.h"
#include "vec/common/string_ref.h"

namespace doris::vectorized {

namespace multiregexps {
namespace doris::vectorized::multiregexps {

template <typename Deleter, Deleter deleter>
struct HyperscanDeleter {
Expand Down Expand Up @@ -75,7 +75,9 @@ class DeferredConstructedRegexps {

/// Returns a pointer to the regexps, constructing them on first use.
///
/// The whole call runs while holding `mutex`. If `regexps` is still empty,
/// `constructor()` is invoked and its result is stored in `regexps`; otherwise the
/// already stored object is reused. The returned pointer refers to the object held
/// by `regexps`.
Regexps* get() {
    std::lock_guard lock(mutex);
    // Build only once; a single check is enough because we hold the lock.
    if (!regexps) {
        regexps = constructor();
    }
    return &*regexps;
}
Expand Down Expand Up @@ -136,41 +138,59 @@ Regexps constructRegexps(const std::vector<String>& str_patterns,
/// We mark the patterns to provide the callback results.
if constexpr (save_indices) {
ids.reset(new unsigned int[patterns.size()]);
for (size_t i = 0; i < patterns.size(); ++i) ids[i] = static_cast<unsigned>(i + 1);
for (size_t i = 0; i < patterns.size(); ++i) {
ids[i] = static_cast<unsigned>(i + 1);
}
}

for (auto& pattern : patterns) {
LOG(INFO) << "pattern: " << pattern << "\n";
}

hs_error_t err;
if constexpr (!WithEditDistance)
if constexpr (!WithEditDistance) {
err = hs_compile_multi(patterns.data(), flags.data(), ids.get(),
static_cast<unsigned>(patterns.size()), HS_MODE_BLOCK, nullptr, &db,
&compile_error);
else
} else {
err = hs_compile_ext_multi(patterns.data(), flags.data(), ids.get(), ext_exprs_ptrs.data(),
static_cast<unsigned>(patterns.size()), HS_MODE_BLOCK, nullptr,
&db, &compile_error);
}

if (err != HS_SUCCESS) {
if (err != HS_SUCCESS) [[unlikely]] {
/// CompilerError is a unique_ptr, so correct memory free after the exception is thrown.
CompilerError error(compile_error);

if (error->expression < 0)
LOG(FATAL) << "Logical error: " + String(error->message);
else
LOG(FATAL) << "Bad arguments: Pattern " + str_patterns[error->expression] +
"failed with error " + String(error->message);
if (error->expression < 0) { // error has nothing to do with the patterns themselves
throw doris::Exception(
ErrorCode::INTERNAL_ERROR,
fmt::format("Compile regexp expression failed. got {}", error->message));
} else {
throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
fmt::format("Compile regexp expression failed. got {}. some "
"expressions may be illegal",
error->message));
}
}

/// We allocate the scratch space only once, then copy it across multiple threads with hs_clone_scratch
/// function which is faster than allocating scratch space each time in each thread.
hs_scratch_t* scratch = nullptr;
err = hs_alloc_scratch(db, &scratch);

/// If not HS_SUCCESS, it is guaranteed that the memory would not be allocated for scratch.
if (err != HS_SUCCESS) LOG(FATAL) << "Could not allocate scratch space for hyperscan";
if (err != HS_SUCCESS) [[unlikely]] {
if (err == HS_NOMEM) [[unlikely]] {
throw doris::Exception(
ErrorCode::MEM_ALLOC_FAILED,
std::string("Allocating memory failed on compiling regexp expressions."));
} else {
throw doris::Exception(
ErrorCode::INVALID_ARGUMENT,
std::string(
"Compile regexp expression failed with unexpected arguments perhaps"));
}
}

return {db, scratch};
}
Expand All @@ -196,7 +216,9 @@ struct GlobalCacheTable {
/// Computes the cache bucket index for a set of patterns and an optional edit distance.
///
/// Every pattern is folded into the hash in the order given, followed by
/// `edit_distance`; the result is reduced modulo `CACHE_SIZE`.
///
/// @param patterns       patterns to hash; taken by const reference so the vector
///                       is not copied on every lookup.
/// @param edit_distance  optional edit distance, mixed into the hash as well.
/// @return an index in the range [0, CACHE_SIZE).
static size_t getBucketIndexFor(const std::vector<String>& patterns,
                                std::optional<UInt32> edit_distance) {
    size_t hash = 0;
    // Hash each pattern exactly once.
    for (const auto& pattern : patterns) {
        boost::hash_combine(hash, pattern);
    }
    boost::hash_combine(hash, edit_distance);
    return hash % CACHE_SIZE;
}
Expand All @@ -212,7 +234,9 @@ DeferredConstructedRegexpsPtr getOrSet(const std::vector<StringRef>& patterns,

std::vector<String> str_patterns;
str_patterns.reserve(patterns.size());
for (const auto& pattern : patterns) str_patterns.emplace_back(pattern.to_string());
for (const auto& pattern : patterns) {
str_patterns.emplace_back(pattern.to_string());
}

size_t bucket_idx = GlobalCacheTable::getBucketIndexFor(str_patterns, edit_distance);

Expand Down Expand Up @@ -249,6 +273,4 @@ DeferredConstructedRegexpsPtr getOrSet(const std::vector<StringRef>& patterns,
return bucket.regexps;
}

} // namespace multiregexps

} // namespace doris::vectorized
} // namespace doris::vectorized::multiregexps
Original file line number Diff line number Diff line change
@@ -1,45 +1,67 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !select --
0

-- !select --
0

-- !select --
0

-- !select --
1

-- !select --
0

-- !select --
0

-- !select --
0

-- !select --
1

-- !select --
1

-- !select --
1

-- !select --
0

-- !select --
1

-- !select --
0

-- !select --
1

-- !select --
1

-- !select --
1

-- !select --
0

-- !select --
0

-- !select --
0

-- !select --
1

-- !select --
1

-- !select --
1
1

Loading