Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion cpp/src/gandiva/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,10 @@ set(SRC_FILES
like_holder.cc
literal_holder.cc
projector.cc
regexp_matches_holder.cc
regex_util.cc
selection_vector.cc
sql_like_holder.cc
tree_expr_builder.cc
to_date_holder.cc
random_generator_holder.cc
Expand Down Expand Up @@ -204,7 +206,8 @@ add_gandiva_test(internals-test
lru_cache_test.cc
to_date_holder_test.cc
simple_arena_test.cc
like_holder_test.cc
sql_like_holder_test.cc
regexp_matches_holder_test.cc
decimal_type_util_test.cc
random_generator_holder_test.cc)

Expand Down
5 changes: 4 additions & 1 deletion cpp/src/gandiva/expr_decomposer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,10 @@ Status ExprDecomposer::Visit(const FieldNode& node) {
// time.
const FunctionNode ExprDecomposer::TryOptimize(const FunctionNode& node) {
if (node.descriptor()->name() == "like") {
return LikeHolder::TryOptimize(node);
return SQLLikeHolder::TryOptimize(node);
} else if (node.descriptor()->name() == "regexp_matches" ||
node.descriptor()->name() == "regexp_like") {
return RegexpMatchesHolder::TryOptimize(node);
} else {
return node;
}
Expand Down
7 changes: 5 additions & 2 deletions cpp/src/gandiva/function_holder_registry.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@
#include "arrow/status.h"

#include "gandiva/function_holder.h"
#include "gandiva/like_holder.h"
#include "gandiva/node.h"
#include "gandiva/random_generator_holder.h"
#include "gandiva/regexp_matches_holder.h"
#include "gandiva/sql_like_holder.h"
#include "gandiva/to_date_holder.h"

namespace gandiva {
Expand Down Expand Up @@ -62,7 +63,9 @@ class FunctionHolderRegistry {
private:
static map_type& makers() {
static map_type maker_map = {
{"like", LAMBDA_MAKER(LikeHolder)},
{"like", LAMBDA_MAKER(SQLLikeHolder)},
{"regexp_matches", LAMBDA_MAKER(RegexpMatchesHolder)},
{"regexp_like", LAMBDA_MAKER(RegexpMatchesHolder)},
{"to_date", LAMBDA_MAKER(ToDateHolder)},
{"random", LAMBDA_MAKER(RandomGeneratorHolder)},
{"rand", LAMBDA_MAKER(RandomGeneratorHolder)},
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/gandiva/function_registry_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
kResultNullIfNull, "gdv_fn_like_utf8_utf8",
NativeFunction::kNeedsFunctionHolder),

NativeFunction("regexp_matches", {"regexp_like"}, DataTypeVector{utf8(), utf8()},
boolean(), kResultNullIfNull, "gdv_fn_regexp_matches_utf8_utf8",
NativeFunction::kNeedsFunctionHolder),

NativeFunction("substr", {"substring"},
DataTypeVector{utf8(), int64() /*offset*/, int64() /*length*/},
utf8(), kResultNullIfNull, "substr_utf8_int64_int64",
Expand Down
18 changes: 17 additions & 1 deletion cpp/src/gandiva/gdv_function_stubs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@
#include "gandiva/engine.h"
#include "gandiva/exported_funcs.h"
#include "gandiva/in_holder.h"
#include "gandiva/like_holder.h"
#include "gandiva/random_generator_holder.h"
#include "gandiva/sql_like_holder.h"
#include "gandiva/to_date_holder.h"

/// Stub functions that can be accessed from LLVM or the pre-compiled library.
Expand All @@ -37,6 +37,11 @@ bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len,
return (*holder)(std::string(data, data_len));
}

// Stub registered under the name "gdv_fn_regexp_matches_utf8_utf8".
// It forwards every argument unchanged to gdv_fn_like_utf8_utf8 and returns
// that function's result.
bool gdv_fn_regexp_matches_utf8_utf8(int64_t ptr, const char* data, int data_len,
                                     const char* pattern, int pattern_len) {
  const bool matched =
      gdv_fn_like_utf8_utf8(ptr, data, data_len, pattern, pattern_len);
  return matched;
}

double gdv_fn_random(int64_t ptr) {
gandiva::RandomGeneratorHolder* holder =
reinterpret_cast<gandiva::RandomGeneratorHolder*>(ptr);
Expand Down Expand Up @@ -187,6 +192,17 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const {
types->i1_type() /*return_type*/, args,
reinterpret_cast<void*>(gdv_fn_like_utf8_utf8));

// gdv_fn_regexp_matches_utf8_utf8
args = {types->i64_type(), // int64_t ptr
types->i8_ptr_type(), // const char* data
types->i32_type(), // int data_len
types->i8_ptr_type(), // const char* pattern
types->i32_type()}; // int pattern_len

engine->AddGlobalMappingForFunc(
"gdv_fn_regexp_matches_utf8_utf8", types->i1_type() /*return_type*/, args,
reinterpret_cast<void*>(gdv_fn_regexp_matches_utf8_utf8));

// gdv_fn_to_date_utf8_utf8_int32
args = {types->i64_type(), // int64_t execution_context
types->i64_type(), // int64_t holder_ptr
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/gandiva/gdv_function_stubs.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ extern "C" {
bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len,
const char* pattern, int pattern_len);

bool gdv_fn_regexp_matches_utf8_utf8(int64_t ptr, const char* data, int data_len,
const char* pattern, int pattern_len);

int64_t gdv_fn_to_date_utf8_utf8_int32(int64_t context, int64_t ptr, const char* data,
int data_len, bool in1_validity,
const char* pattern, int pattern_len,
Expand Down
60 changes: 8 additions & 52 deletions cpp/src/gandiva/like_holder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,76 +17,32 @@

#include "gandiva/like_holder.h"

#include <regex>
#include "gandiva/node.h"
#include "gandiva/regex_util.h"

namespace gandiva {

// Returns true when `type` is arrow::Type::STRING or arrow::Type::BINARY, the
// two type ids accepted for the pattern literal.
static bool IsArrowStringLiteral(arrow::Type::type type) {
  return type == arrow::Type::STRING || type == arrow::Type::BINARY;
}

// Validates a pattern-matching function node and extracts its pattern text.
//
// Checks, in order, that `node` has exactly two children, that the second
// child is a LiteralNode, and that the literal's type is STRING or BINARY.
// On success the literal's string value is copied into `*pattern`.
//
// Returns Status::Invalid (the message starts with the quoted function name
// taken from node.descriptor()->name()) when any check fails, Status::OK
// otherwise.
Status LikeHolder::Make(const FunctionNode& node, std::string* pattern) {
  ARROW_RETURN_IF(node.children().size() != 2,
                  Status::Invalid("'" + node.descriptor()->name() +
                                  "' function requires two parameters"));

  // dynamic_cast yields nullptr when the second argument is not a literal.
  auto literal = dynamic_cast<LiteralNode*>(node.children().at(1).get());
  ARROW_RETURN_IF(
      literal == nullptr,
      Status::Invalid("'" + node.descriptor()->name() +
                      "' function requires a literal as the second parameter"));

  auto literal_type = literal->return_type()->id();
  ARROW_RETURN_IF(
      !IsArrowStringLiteral(literal_type),
      Status::Invalid("'" + node.descriptor()->name() +
                      "' function requires a string literal as the second parameter"));

  *pattern = arrow::util::get<std::string>(literal->holder());
  return Status::OK();
}

}  // namespace gandiva
24 changes: 4 additions & 20 deletions cpp/src/gandiva/like_holder.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,29 +31,13 @@

namespace gandiva {

/// Base class for function holders of pattern-matching SQL functions such as
/// 'like' and 'regexp_matches'.
///
/// Concrete holders derive from this class and implement operator() to decide
/// whether a value matches their pattern.
class GANDIVA_EXPORT LikeHolder : public FunctionHolder {
 public:
  ~LikeHolder() override = default;

  /// Validate a two-argument pattern function node and copy the string value
  /// of its literal second argument into `pattern`.
  ///
  /// \param[in] node function node to validate
  /// \param[out] pattern receives the literal pattern text on success
  /// \return Status::Invalid if the node does not have exactly two children or
  ///         the second child is not a string/binary literal, else Status::OK
  static Status Make(const FunctionNode& node, std::string* pattern);

  /// Return true if the data matches the pattern.
  virtual bool operator()(const std::string& data) = 0;
};

} // namespace gandiva
Expand Down
73 changes: 73 additions & 0 deletions cpp/src/gandiva/regexp_matches_holder.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "gandiva/regexp_matches_holder.h"

#include <regex>
#include "gandiva/node.h"
#include "gandiva/regex_util.h"

namespace gandiva {

// Pattern shape "^<literal>" optionally followed by ".*", where <literal> is
// one or more word (\w) or whitespace (\s) characters. Group 1 captures the
// literal; group 2 is the optional ".*".
RE2 RegexpMatchesHolder::starts_with_regex_(R"(\^([\w\s]+)(\.\*)?)");
// Pattern shape "<literal>$" optionally preceded by ".*". Group 1 is the
// optional ".*"; group 2 captures the literal.
RE2 RegexpMatchesHolder::ends_with_regex_(R"((\.\*)?([\w\s]+)\$)");

// Replaces a regex function node whose pattern is just an anchored literal
// with a call to a plain string function:
//   - pattern fully matched by starts_with_regex_ -> "starts_with"
//   - pattern fully matched by ends_with_regex_   -> "ends_with"
// The new node keeps the original first argument and return type and takes the
// captured literal as its second argument. When no holder can be built for
// `node`, or the pattern has neither shape, `node` is returned as is.
const FunctionNode RegexpMatchesHolder::TryOptimize(const FunctionNode& node) {
  std::shared_ptr<RegexpMatchesHolder> holder;
  if (!Make(node, &holder).ok()) {
    // Could not build a holder, return original node.
    return node;
  }

  const std::string& pattern = holder->pattern_;
  auto literal_type = node.children().at(1)->return_type();
  std::string literal;

  if (RE2::FullMatch(pattern, starts_with_regex_, &literal)) {
    auto prefix_node =
        std::make_shared<LiteralNode>(literal_type, LiteralHolder(literal), false);
    return FunctionNode("starts_with", {node.children().at(0), prefix_node},
                        node.return_type());
  }

  // The first capture group (the optional leading ".*") is not needed, so a
  // null void* is passed in its slot and only the literal is extracted.
  if (RE2::FullMatch(pattern, ends_with_regex_, static_cast<void*>(nullptr),
                     &literal)) {
    auto suffix_node =
        std::make_shared<LiteralNode>(literal_type, LiteralHolder(literal), false);
    return FunctionNode("ends_with", {node.children().at(0), suffix_node},
                        node.return_type());
  }

  // Could not optimize, return original node.
  return node;
}

// Builds a holder from a function node: LikeHolder::Make validates the node and
// extracts its literal pattern text, which is then handed to the string
// overload of Make. Any failure status from either step is returned.
Status RegexpMatchesHolder::Make(const FunctionNode& node,
                                 std::shared_ptr<RegexpMatchesHolder>* holder) {
  std::string literal_pattern;
  ARROW_RETURN_NOT_OK(LikeHolder::Make(node, &literal_pattern));

  return Make(literal_pattern, holder);
}

// Builds a holder from a pattern string. The holder is constructed first and
// its compiled regex is then checked; `*holder` is assigned only when the
// regex reports ok(), otherwise Status::Invalid is returned and `*holder` is
// left untouched.
Status RegexpMatchesHolder::Make(const std::string& pcre_pattern,
                                 std::shared_ptr<RegexpMatchesHolder>* holder) {
  std::shared_ptr<RegexpMatchesHolder> candidate(
      new RegexpMatchesHolder(pcre_pattern));

  const bool compiled = candidate->regex_.ok();
  ARROW_RETURN_IF(!compiled,
                  Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed"));

  *holder = candidate;
  return Status::OK();
}

} // namespace gandiva
61 changes: 61 additions & 0 deletions cpp/src/gandiva/regexp_matches_holder.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef GANDIVA_REGEXP_MATCHES_HOLDER_H
#define GANDIVA_REGEXP_MATCHES_HOLDER_H

#include <memory>
#include <string>

#include <re2/re2.h>

#include "gandiva/like_holder.h"

namespace gandiva {

/// Function holder for the 'regexp_matches' and 'regexp_like' functions.
///
/// Stores a pattern string together with the RE2 regex built from it and
/// reports whether that regex matches anywhere inside a value.
class GANDIVA_EXPORT RegexpMatchesHolder : public LikeHolder {
 public:
  ~RegexpMatchesHolder() override = default;

  /// Create a holder from a function node.
  static Status Make(const FunctionNode& node,
                     std::shared_ptr<RegexpMatchesHolder>* holder);

  /// Create a holder directly from a pattern string.
  static Status Make(const std::string& pcre_pattern,
                     std::shared_ptr<RegexpMatchesHolder>* holder);

  /// Try to optimise a function node carrying a "regexp_matches" pattern;
  /// returns the node to use in its place.
  static const FunctionNode TryOptimize(const FunctionNode& node);

  /// Return true if the regex matches some part of the data
  /// (RE2::PartialMatch, so the whole value need not match).
  bool operator()(const std::string& data) override {
    const bool found = RE2::PartialMatch(data, regex_);
    return found;
  }

 private:
  /// Stores `pattern` and builds the regex from it; callers go through Make().
  explicit RegexpMatchesHolder(const std::string& pattern)
      : pattern_(pattern), regex_(pattern) {}

  std::string pattern_;  // pattern text as passed to the constructor
  RE2 regex_;            // regex built from that pattern

  static RE2 starts_with_regex_;  // recognises patterns reducible to starts_with
  static RE2 ends_with_regex_;    // recognises patterns reducible to ends_with
};
} // namespace gandiva

#endif // GANDIVA_REGEXP_MATCHES_HOLDER_H
Loading