diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 60bf63a914a..e88a00f6ee1 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -71,8 +71,10 @@ set(SRC_FILES like_holder.cc literal_holder.cc projector.cc + regexp_matches_holder.cc regex_util.cc selection_vector.cc + sql_like_holder.cc tree_expr_builder.cc to_date_holder.cc random_generator_holder.cc @@ -204,7 +206,8 @@ add_gandiva_test(internals-test lru_cache_test.cc to_date_holder_test.cc simple_arena_test.cc - like_holder_test.cc + sql_like_holder_test.cc + regexp_matches_holder_test.cc decimal_type_util_test.cc random_generator_holder_test.cc) diff --git a/cpp/src/gandiva/expr_decomposer.cc b/cpp/src/gandiva/expr_decomposer.cc index 0902468e31e..521446175ef 100644 --- a/cpp/src/gandiva/expr_decomposer.cc +++ b/cpp/src/gandiva/expr_decomposer.cc @@ -53,7 +53,10 @@ Status ExprDecomposer::Visit(const FieldNode& node) { // time. const FunctionNode ExprDecomposer::TryOptimize(const FunctionNode& node) { if (node.descriptor()->name() == "like") { - return LikeHolder::TryOptimize(node); + return SQLLikeHolder::TryOptimize(node); + } else if (node.descriptor()->name() == "regexp_matches" || + node.descriptor()->name() == "regexp_like") { + return RegexpMatchesHolder::TryOptimize(node); } else { return node; } diff --git a/cpp/src/gandiva/function_holder_registry.h b/cpp/src/gandiva/function_holder_registry.h index a2baa024b99..6a489c3502c 100644 --- a/cpp/src/gandiva/function_holder_registry.h +++ b/cpp/src/gandiva/function_holder_registry.h @@ -26,9 +26,10 @@ #include "arrow/status.h" #include "gandiva/function_holder.h" -#include "gandiva/like_holder.h" #include "gandiva/node.h" #include "gandiva/random_generator_holder.h" +#include "gandiva/regexp_matches_holder.h" +#include "gandiva/sql_like_holder.h" #include "gandiva/to_date_holder.h" namespace gandiva { @@ -62,7 +63,9 @@ class FunctionHolderRegistry { private: static map_type& makers() { static 
map_type maker_map = { - {"like", LAMBDA_MAKER(LikeHolder)}, + {"like", LAMBDA_MAKER(SQLLikeHolder)}, + {"regexp_matches", LAMBDA_MAKER(RegexpMatchesHolder)}, + {"regexp_like", LAMBDA_MAKER(RegexpMatchesHolder)}, {"to_date", LAMBDA_MAKER(ToDateHolder)}, {"random", LAMBDA_MAKER(RandomGeneratorHolder)}, {"rand", LAMBDA_MAKER(RandomGeneratorHolder)}, diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 3a7066d87e0..a725df6b5bf 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -76,6 +76,10 @@ std::vector GetStringFunctionRegistry() { kResultNullIfNull, "gdv_fn_like_utf8_utf8", NativeFunction::kNeedsFunctionHolder), + NativeFunction("regexp_matches", {"regexp_like"}, DataTypeVector{utf8(), utf8()}, + boolean(), kResultNullIfNull, "gdv_fn_regexp_matches_utf8_utf8", + NativeFunction::kNeedsFunctionHolder), + NativeFunction("substr", {"substring"}, DataTypeVector{utf8(), int64() /*offset*/, int64() /*length*/}, utf8(), kResultNullIfNull, "substr_utf8_int64_int64", diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index da7a03b312c..d0466ed1caa 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -23,8 +23,8 @@ #include "gandiva/engine.h" #include "gandiva/exported_funcs.h" #include "gandiva/in_holder.h" -#include "gandiva/like_holder.h" #include "gandiva/random_generator_holder.h" +#include "gandiva/sql_like_holder.h" #include "gandiva/to_date_holder.h" /// Stub functions that can be accessed from LLVM or the pre-compiled library. 
@@ -37,6 +37,11 @@ bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len, return (*holder)(std::string(data, data_len)); } +bool gdv_fn_regexp_matches_utf8_utf8(int64_t ptr, const char* data, int data_len, + const char* pattern, int pattern_len) { + return gdv_fn_like_utf8_utf8(ptr, data, data_len, pattern, pattern_len); +} + double gdv_fn_random(int64_t ptr) { gandiva::RandomGeneratorHolder* holder = reinterpret_cast(ptr); @@ -187,6 +192,17 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { types->i1_type() /*return_type*/, args, reinterpret_cast(gdv_fn_like_utf8_utf8)); + // gdv_fn_regexp_matches_utf8_utf8 + args = {types->i64_type(), // int64_t ptr + types->i8_ptr_type(), // const char* data + types->i32_type(), // int data_len + types->i8_ptr_type(), // const char* pattern + types->i32_type()}; // int pattern_len + + engine->AddGlobalMappingForFunc( + "gdv_fn_regexp_matches_utf8_utf8", types->i1_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_regexp_matches_utf8_utf8)); + // gdv_fn_to_date_utf8_utf8_int32 args = {types->i64_type(), // int64_t execution_context types->i64_type(), // int64_t holder_ptr diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h index fcdf7d6ac66..f8bdcdc38c9 100644 --- a/cpp/src/gandiva/gdv_function_stubs.h +++ b/cpp/src/gandiva/gdv_function_stubs.h @@ -26,6 +26,9 @@ extern "C" { bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len, const char* pattern, int pattern_len); +bool gdv_fn_regexp_matches_utf8_utf8(int64_t ptr, const char* data, int data_len, + const char* pattern, int pattern_len); + int64_t gdv_fn_to_date_utf8_utf8_int32(int64_t context, int64_t ptr, const char* data, int data_len, bool in1_validity, const char* pattern, int pattern_len, diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc index 404105b1070..b25358baa5f 100644 --- a/cpp/src/gandiva/like_holder.cc +++ b/cpp/src/gandiva/like_holder.cc @@ 
-17,76 +17,32 @@ #include "gandiva/like_holder.h" -#include #include "gandiva/node.h" -#include "gandiva/regex_util.h" namespace gandiva { -RE2 LikeHolder::starts_with_regex_(R"((\w|\s)*\.\*)"); -RE2 LikeHolder::ends_with_regex_(R"(\.\*(\w|\s)*)"); - -// Short-circuit pattern matches for the two common sub cases : -// - starts_with and ends_with. -const FunctionNode LikeHolder::TryOptimize(const FunctionNode& node) { - std::shared_ptr holder; - auto status = Make(node, &holder); - if (status.ok()) { - std::string& pattern = holder->pattern_; - auto literal_type = node.children().at(1)->return_type(); - - if (RE2::FullMatch(pattern, starts_with_regex_)) { - auto prefix = pattern.substr(0, pattern.length() - 2); // trim .* - auto prefix_node = - std::make_shared(literal_type, LiteralHolder(prefix), false); - return FunctionNode("starts_with", {node.children().at(0), prefix_node}, - node.return_type()); - } else if (RE2::FullMatch(pattern, ends_with_regex_)) { - auto suffix = pattern.substr(2); // skip .* - auto suffix_node = - std::make_shared(literal_type, LiteralHolder(suffix), false); - return FunctionNode("ends_with", {node.children().at(0), suffix_node}, - node.return_type()); - } - } - - // Could not optimize, return original node. 
- return node; -} - static bool IsArrowStringLiteral(arrow::Type::type type) { return type == arrow::Type::STRING || type == arrow::Type::BINARY; } -Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr* holder) { +Status LikeHolder::Make(const FunctionNode& node, std::string* pattern) { ARROW_RETURN_IF(node.children().size() != 2, - Status::Invalid("'like' function requires two parameters")); + Status::Invalid("'" + node.descriptor()->name() + + "' function requires two parameters")); auto literal = dynamic_cast(node.children().at(1).get()); ARROW_RETURN_IF( literal == nullptr, - Status::Invalid("'like' function requires a literal as the second parameter")); + Status::Invalid("'" + node.descriptor()->name() + + "' function requires a literal as the second parameter")); auto literal_type = literal->return_type()->id(); ARROW_RETURN_IF( !IsArrowStringLiteral(literal_type), - Status::Invalid( - "'like' function requires a string literal as the second parameter")); + Status::Invalid("'" + node.descriptor()->name() + + "' function requires a string literal as the second parameter")); - return Make(arrow::util::get(literal->holder()), holder); -} - -Status LikeHolder::Make(const std::string& sql_pattern, - std::shared_ptr* holder) { - std::string pcre_pattern; - ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); - - auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern)); - ARROW_RETURN_IF(!lholder->regex_.ok(), - Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); - - *holder = lholder; + *pattern = arrow::util::get(literal->holder()); return Status::OK(); } - } // namespace gandiva diff --git a/cpp/src/gandiva/like_holder.h b/cpp/src/gandiva/like_holder.h index eab30bf732f..866d74f4e85 100644 --- a/cpp/src/gandiva/like_holder.h +++ b/cpp/src/gandiva/like_holder.h @@ -31,29 +31,13 @@ namespace gandiva { -/// Function Holder for SQL 'like' +/// Base class for Function Holder for pattern matching SQL
functions like +/// 'like' and 'regexp_matches' class GANDIVA_EXPORT LikeHolder : public FunctionHolder { public: - ~LikeHolder() override = default; + static Status Make(const FunctionNode& node, std::string* pattern); - static Status Make(const FunctionNode& node, std::shared_ptr* holder); - - static Status Make(const std::string& sql_pattern, std::shared_ptr* holder); - - // Try and optimise a function node with a "like" pattern. - static const FunctionNode TryOptimize(const FunctionNode& node); - - /// Return true if the data matches the pattern. - bool operator()(const std::string& data) { return RE2::FullMatch(data, regex_); } - - private: - explicit LikeHolder(const std::string& pattern) : pattern_(pattern), regex_(pattern) {} - - std::string pattern_; // posix pattern string, to help debugging - RE2 regex_; // compiled regex for the pattern - - static RE2 starts_with_regex_; // pre-compiled pattern for matching starts_with - static RE2 ends_with_regex_; // pre-compiled pattern for matching ends_with + virtual bool operator()(const std::string& data) = 0; }; } // namespace gandiva diff --git a/cpp/src/gandiva/regexp_matches_holder.cc b/cpp/src/gandiva/regexp_matches_holder.cc new file mode 100644 index 00000000000..6b10678573d --- /dev/null +++ b/cpp/src/gandiva/regexp_matches_holder.cc @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/regexp_matches_holder.h" + +#include +#include "gandiva/node.h" +#include "gandiva/regex_util.h" + +namespace gandiva { + +RE2 RegexpMatchesHolder::starts_with_regex_(R"(\^([\w\s]+)(\.\*)?)"); +RE2 RegexpMatchesHolder::ends_with_regex_(R"((\.\*)?([\w\s]+)\$)"); + +// Short-circuit pattern matches for the two common sub cases : +// - starts_with and ends_with. +const FunctionNode RegexpMatchesHolder::TryOptimize(const FunctionNode& node) { + std::shared_ptr holder; + auto status = Make(node, &holder); + if (status.ok()) { + std::string& pattern = holder->pattern_; + auto literal_type = node.children().at(1)->return_type(); + std::string substr; + if (RE2::FullMatch(pattern, starts_with_regex_, &substr)) { + auto prefix_node = + std::make_shared(literal_type, LiteralHolder(substr), false); + return FunctionNode("starts_with", {node.children().at(0), prefix_node}, + node.return_type()); + } else if (RE2::FullMatch(pattern, ends_with_regex_, (void*)NULL, &substr)) { + auto suffix_node = + std::make_shared(literal_type, LiteralHolder(substr), false); + return FunctionNode("ends_with", {node.children().at(0), suffix_node}, + node.return_type()); + } + } + + // Could not optimize, return original node. 
+ return node; +} + +Status RegexpMatchesHolder::Make(const FunctionNode& node, + std::shared_ptr* holder) { + std::string pcre_pattern; + ARROW_RETURN_NOT_OK(LikeHolder::Make(node, &pcre_pattern)); + return Make(pcre_pattern, holder); +} + +Status RegexpMatchesHolder::Make(const std::string& pcre_pattern, + std::shared_ptr* holder) { + auto lholder = + std::shared_ptr(new RegexpMatchesHolder(pcre_pattern)); + ARROW_RETURN_IF(!lholder->regex_.ok(), + Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); + + *holder = lholder; + return Status::OK(); +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/regexp_matches_holder.h b/cpp/src/gandiva/regexp_matches_holder.h new file mode 100644 index 00000000000..d2ca892a177 --- /dev/null +++ b/cpp/src/gandiva/regexp_matches_holder.h @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef GANDIVA_REGEXP_MATCHES_HOLDER_H +#define GANDIVA_REGEXP_MATCHES_HOLDER_H + +#include +#include + +#include + +#include "gandiva/like_holder.h" + +namespace gandiva { + +/// Function Holder for 'regexp_matches' and 'regexp_like' functions +class GANDIVA_EXPORT RegexpMatchesHolder : public LikeHolder { + public: + ~RegexpMatchesHolder() override = default; + + static Status Make(const FunctionNode& node, + std::shared_ptr* holder); + + static Status Make(const std::string& pcre_pattern, + std::shared_ptr* holder); + + // Try and optimise a function node with a "regexp_matches" pattern. + static const FunctionNode TryOptimize(const FunctionNode& node); + + /// Return true if there is a match in the data. + bool operator()(const std::string& data) override { + return RE2::PartialMatch(data, regex_); + } + + private: + explicit RegexpMatchesHolder(const std::string& pattern) + : pattern_(pattern), regex_(pattern) {} + + std::string pattern_; // posix pattern string, to help debugging + RE2 regex_; // compiled regex for the pattern + + static RE2 starts_with_regex_; // pre-compiled pattern for matching starts_with + static RE2 ends_with_regex_; // pre-compiled pattern for matching ends_with +}; +} // namespace gandiva + +#endif // GANDIVA_REGEXP_MATCHES_HOLDER_H diff --git a/cpp/src/gandiva/regexp_matches_holder_test.cc b/cpp/src/gandiva/regexp_matches_holder_test.cc new file mode 100644 index 00000000000..d2ece61a754 --- /dev/null +++ b/cpp/src/gandiva/regexp_matches_holder_test.cc @@ -0,0 +1,191 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/regexp_matches_holder.h" +#include "gandiva/regex_util.h" + +#include +#include + +#include + +namespace gandiva { + +class TestRegexpMatchesHolder : public ::testing::Test { + public: + FunctionNode BuildRegexpMatches(std::string pattern) { + auto field = std::make_shared(arrow::field("in", arrow::utf8())); + auto pattern_node = + std::make_shared(arrow::utf8(), LiteralHolder(pattern), false); + return FunctionNode("regexp_matches", {field, pattern_node}, arrow::boolean()); + } +}; + +TEST_F(TestRegexpMatchesHolder, TestString) { + std::shared_ptr regexp_matches_holder; + + auto status = RegexpMatchesHolder::Make("ab", &regexp_matches_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& regexp_matches = *regexp_matches_holder; + EXPECT_TRUE(regexp_matches("ab")); + EXPECT_TRUE(regexp_matches("abc")); + EXPECT_TRUE(regexp_matches("abcd")); + EXPECT_TRUE(regexp_matches("cab")); + + EXPECT_FALSE(regexp_matches("a")); +} + +TEST_F(TestRegexpMatchesHolder, TestDotStar) { + std::shared_ptr regexp_matches_holder; + + auto status = RegexpMatchesHolder::Make("a.*b", &regexp_matches_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& regexp_matches = *regexp_matches_holder; + EXPECT_TRUE(regexp_matches("ab")); + EXPECT_TRUE(regexp_matches("adeb")); + EXPECT_TRUE(regexp_matches("abc")); + EXPECT_TRUE(regexp_matches("cabc")); + EXPECT_TRUE(regexp_matches("caebf")); + + EXPECT_FALSE(regexp_matches("ba")); + EXPECT_FALSE(regexp_matches("a")); +} + +TEST_F(TestRegexpMatchesHolder, TestDot) { + std::shared_ptr
regexp_matches_holder; + + auto status = RegexpMatchesHolder::Make("ab.", &regexp_matches_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& regexp_matches = *regexp_matches_holder; + EXPECT_TRUE(regexp_matches("abc")); + EXPECT_TRUE(regexp_matches("abd")); + EXPECT_TRUE(regexp_matches("abcd")); + EXPECT_TRUE(regexp_matches("dabc")); + + EXPECT_FALSE(regexp_matches("a")); + EXPECT_FALSE(regexp_matches("ab")); +} + +TEST_F(TestRegexpMatchesHolder, TestAnchors) { + std::shared_ptr regexp_matches_holder; + + auto status = RegexpMatchesHolder::Make("^ab.*c$", &regexp_matches_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& regexp_matches = *regexp_matches_holder; + EXPECT_TRUE(regexp_matches("abdc")); + EXPECT_TRUE(regexp_matches("abc")); + + EXPECT_FALSE(regexp_matches("abcd")); + EXPECT_FALSE(regexp_matches("dabc")); +} + +TEST_F(TestRegexpMatchesHolder, TestIgnoreCase) { + std::shared_ptr regexp_matches_holder; + + auto status = RegexpMatchesHolder::Make("(?i)ab", &regexp_matches_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& regexp_matches = *regexp_matches_holder; + EXPECT_TRUE(regexp_matches("abc")); + EXPECT_TRUE(regexp_matches("daBc")); + EXPECT_TRUE(regexp_matches("CAB")); + + EXPECT_FALSE(regexp_matches("ba")); +} + +TEST_F(TestRegexpMatchesHolder, TestCharacterClass) { + std::shared_ptr regexp_matches_holder; + + auto status = RegexpMatchesHolder::Make("[ab]c", &regexp_matches_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& regexp_matches = *regexp_matches_holder; + EXPECT_TRUE(regexp_matches("acd")); + EXPECT_TRUE(regexp_matches("ebc")); + EXPECT_TRUE(regexp_matches("abc")); + + EXPECT_FALSE(regexp_matches("ab")); +} + +TEST_F(TestRegexpMatchesHolder, TestEscapeCharacter) { + std::shared_ptr regexp_matches_holder; + + auto status = RegexpMatchesHolder::Make("\\.\\*", &regexp_matches_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& regexp_matches =
*regexp_matches_holder; + EXPECT_TRUE(regexp_matches(".*")); + + EXPECT_FALSE(regexp_matches("ab")); +} + +TEST_F(TestRegexpMatchesHolder, TestNonAsciiMatches) { + std::shared_ptr regexp_matches_holder; + + auto status = RegexpMatchesHolder::Make(".*çåå†.*", &regexp_matches_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& regexp_matches = *regexp_matches_holder; + EXPECT_TRUE(regexp_matches("açåå†b")); + + EXPECT_FALSE(regexp_matches("ab")); +} + +TEST_F(TestRegexpMatchesHolder, TestOptimise) { + // optimise for 'starts_with' + auto fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("^abc")); + EXPECT_EQ(fnode.descriptor()->name(), "starts_with"); + EXPECT_EQ(fnode.ToString(), "bool starts_with((string) in, (const string) abc)"); + + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("^abc.*")); + EXPECT_EQ(fnode.descriptor()->name(), "starts_with"); + EXPECT_EQ(fnode.ToString(), "bool starts_with((string) in, (const string) abc)"); + + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("^ab cd")); + EXPECT_EQ(fnode.descriptor()->name(), "starts_with"); + EXPECT_EQ(fnode.ToString(), "bool starts_with((string) in, (const string) ab cd)"); + + // optimise for 'ends_with' + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("xyz$")); + EXPECT_EQ(fnode.descriptor()->name(), "ends_with"); + EXPECT_EQ(fnode.ToString(), "bool ends_with((string) in, (const string) xyz)"); + + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches(".*xyz$")); + EXPECT_EQ(fnode.descriptor()->name(), "ends_with"); + EXPECT_EQ(fnode.ToString(), "bool ends_with((string) in, (const string) xyz)"); + + // no optimisation for others.
+ fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("^xyz$")); + EXPECT_EQ(fnode.descriptor()->name(), "regexp_matches"); + + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("^xy.*z")); + EXPECT_EQ(fnode.descriptor()->name(), "regexp_matches"); + + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("^.*")); + EXPECT_EQ(fnode.descriptor()->name(), "regexp_matches"); + + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("x.yz$")); + EXPECT_EQ(fnode.descriptor()->name(), "regexp_matches"); + + fnode = RegexpMatchesHolder::TryOptimize(BuildRegexpMatches("^[xyz]")); + EXPECT_EQ(fnode.descriptor()->name(), "regexp_matches"); +} +} // namespace gandiva diff --git a/cpp/src/gandiva/sql_like_holder.cc b/cpp/src/gandiva/sql_like_holder.cc new file mode 100644 index 00000000000..c0d8aa26825 --- /dev/null +++ b/cpp/src/gandiva/sql_like_holder.cc @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "gandiva/sql_like_holder.h" + +#include +#include "gandiva/node.h" +#include "gandiva/regex_util.h" + +namespace gandiva { + +RE2 SQLLikeHolder::starts_with_regex_(R"((\w|\s)*\.\*)"); +RE2 SQLLikeHolder::ends_with_regex_(R"(\.\*(\w|\s)*)"); + +// Short-circuit pattern matches for the two common sub cases : +// - starts_with and ends_with. +const FunctionNode SQLLikeHolder::TryOptimize(const FunctionNode& node) { + std::shared_ptr holder; + auto status = Make(node, &holder); + if (status.ok()) { + std::string& pattern = holder->pattern_; + auto literal_type = node.children().at(1)->return_type(); + + if (RE2::FullMatch(pattern, starts_with_regex_)) { + auto prefix = pattern.substr(0, pattern.length() - 2); // trim .* + auto prefix_node = + std::make_shared(literal_type, LiteralHolder(prefix), false); + return FunctionNode("starts_with", {node.children().at(0), prefix_node}, + node.return_type()); + } else if (RE2::FullMatch(pattern, ends_with_regex_)) { + auto suffix = pattern.substr(2); // skip .* + auto suffix_node = + std::make_shared(literal_type, LiteralHolder(suffix), false); + return FunctionNode("ends_with", {node.children().at(0), suffix_node}, + node.return_type()); + } + } + + // Could not optimize, return original node. 
+ return node; +} + +Status SQLLikeHolder::Make(const FunctionNode& node, + std::shared_ptr* holder) { + std::string sql_pattern; + ARROW_RETURN_NOT_OK(LikeHolder::Make(node, &sql_pattern)); + return Make(sql_pattern, holder); +} + +Status SQLLikeHolder::Make(const std::string& sql_pattern, + std::shared_ptr* holder) { + std::string pcre_pattern; + ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); + + auto lholder = std::shared_ptr(new SQLLikeHolder(pcre_pattern)); + ARROW_RETURN_IF(!lholder->regex_.ok(), + Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); + + *holder = lholder; + return Status::OK(); +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/sql_like_holder.h b/cpp/src/gandiva/sql_like_holder.h new file mode 100644 index 00000000000..44e9750a80a --- /dev/null +++ b/cpp/src/gandiva/sql_like_holder.h @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef GANDIVA_SQL_LIKE_HOLDER_H +#define GANDIVA_SQL_LIKE_HOLDER_H + +#include +#include + +#include + +#include "gandiva/like_holder.h" + +namespace gandiva { + +/// Function Holder for SQL 'like' +class GANDIVA_EXPORT SQLLikeHolder : public LikeHolder { + public: + ~SQLLikeHolder() override = default; + + static Status Make(const FunctionNode& node, std::shared_ptr* holder); + + static Status Make(const std::string& sql_pattern, + std::shared_ptr* holder); + + // Try and optimise a function node with a "like" pattern. + static const FunctionNode TryOptimize(const FunctionNode& node); + + /// Return true if the data matches the pattern. + bool operator()(const std::string& data) override { + return RE2::FullMatch(data, regex_); + } + + private: + explicit SQLLikeHolder(const std::string& pattern) + : pattern_(pattern), regex_(pattern) {} + + std::string pattern_; // posix pattern string, to help debugging + RE2 regex_; // compiled regex for the pattern + + static RE2 starts_with_regex_; // pre-compiled pattern for matching starts_with + static RE2 ends_with_regex_; // pre-compiled pattern for matching ends_with +}; + +} // namespace gandiva + +#endif // GANDIVA_SQL_LIKE_HOLDER_H diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/sql_like_holder_test.cc similarity index 66% rename from cpp/src/gandiva/like_holder_test.cc rename to cpp/src/gandiva/sql_like_holder_test.cc index 817473d7bb2..60d2ddb0f7a 100644 --- a/cpp/src/gandiva/like_holder_test.cc +++ b/cpp/src/gandiva/sql_like_holder_test.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-#include "gandiva/like_holder.h" +#include "gandiva/sql_like_holder.h" #include "gandiva/regex_util.h" #include @@ -25,7 +25,7 @@ namespace gandiva { -class TestLikeHolder : public ::testing::Test { +class TestSQLLikeHolder : public ::testing::Test { public: FunctionNode BuildLike(std::string pattern) { auto field = std::make_shared(arrow::field("in", arrow::utf8())); @@ -35,13 +35,13 @@ class TestLikeHolder : public ::testing::Test { } }; -TEST_F(TestLikeHolder, TestMatchAny) { - std::shared_ptr like_holder; +TEST_F(TestSQLLikeHolder, TestMatchAny) { + std::shared_ptr sql_like_holder; - auto status = LikeHolder::Make("ab%", &like_holder); + auto status = SQLLikeHolder::Make("ab%", &sql_like_holder); EXPECT_EQ(status.ok(), true) << status.message(); - auto& like = *like_holder; + auto& like = *sql_like_holder; EXPECT_TRUE(like("ab")); EXPECT_TRUE(like("abc")); EXPECT_TRUE(like("abcd")); @@ -50,13 +50,13 @@ TEST_F(TestLikeHolder, TestMatchAny) { EXPECT_FALSE(like("cab")); } -TEST_F(TestLikeHolder, TestMatchOne) { - std::shared_ptr like_holder; +TEST_F(TestSQLLikeHolder, TestMatchOne) { + std::shared_ptr sql_like_holder; - auto status = LikeHolder::Make("ab_", &like_holder); + auto status = SQLLikeHolder::Make("ab_", &sql_like_holder); EXPECT_EQ(status.ok(), true) << status.message(); - auto& like = *like_holder; + auto& like = *sql_like_holder; EXPECT_TRUE(like("abc")); EXPECT_TRUE(like("abd")); @@ -65,18 +65,18 @@ TEST_F(TestLikeHolder, TestMatchOne) { EXPECT_FALSE(like("dabc")); } -TEST_F(TestLikeHolder, TestPcreSpecial) { - std::shared_ptr like_holder; +TEST_F(TestSQLLikeHolder, TestPcreSpecial) { + std::shared_ptr sql_like_holder; - auto status = LikeHolder::Make(".*ab_", &like_holder); + auto status = SQLLikeHolder::Make(".*ab_", &sql_like_holder); EXPECT_EQ(status.ok(), true) << status.message(); - auto& like = *like_holder; + auto& like = *sql_like_holder; EXPECT_TRUE(like(".*abc")); // . 
and * aren't special in sql regex EXPECT_FALSE(like("xxabc")); } -TEST_F(TestLikeHolder, TestRegexEscape) { +TEST_F(TestSQLLikeHolder, TestRegexEscape) { std::string res; auto status = RegexUtil::SqlLikePatternToPcre("#%hello#_abc_def##", '#', res); EXPECT_TRUE(status.ok()) << status.message(); @@ -84,44 +84,44 @@ TEST_F(TestLikeHolder, TestRegexEscape) { EXPECT_EQ(res, "%hello_abc.def#"); } -TEST_F(TestLikeHolder, TestDot) { - std::shared_ptr like_holder; +TEST_F(TestSQLLikeHolder, TestDot) { + std::shared_ptr sql_like_holder; - auto status = LikeHolder::Make("abc.", &like_holder); + auto status = SQLLikeHolder::Make("abc.", &sql_like_holder); EXPECT_EQ(status.ok(), true) << status.message(); - auto& like = *like_holder; + auto& like = *sql_like_holder; EXPECT_FALSE(like("abcd")); } -TEST_F(TestLikeHolder, TestOptimise) { +TEST_F(TestSQLLikeHolder, TestOptimise) { // optimise for 'starts_with' - auto fnode = LikeHolder::TryOptimize(BuildLike("xy 123z%")); + auto fnode = SQLLikeHolder::TryOptimize(BuildLike("xy 123z%")); EXPECT_EQ(fnode.descriptor()->name(), "starts_with"); EXPECT_EQ(fnode.ToString(), "bool starts_with((string) in, (const string) xy 123z)"); // optimise for 'ends_with' - fnode = LikeHolder::TryOptimize(BuildLike("%xyz")); + fnode = SQLLikeHolder::TryOptimize(BuildLike("%xyz")); EXPECT_EQ(fnode.descriptor()->name(), "ends_with"); EXPECT_EQ(fnode.ToString(), "bool ends_with((string) in, (const string) xyz)"); // no optimisation for others. 
- fnode = LikeHolder::TryOptimize(BuildLike("xyz_")); + fnode = SQLLikeHolder::TryOptimize(BuildLike("xyz_")); EXPECT_EQ(fnode.descriptor()->name(), "like"); - fnode = LikeHolder::TryOptimize(BuildLike("_xyz")); + fnode = SQLLikeHolder::TryOptimize(BuildLike("_xyz")); EXPECT_EQ(fnode.descriptor()->name(), "like"); - fnode = LikeHolder::TryOptimize(BuildLike("%xyz%")); + fnode = SQLLikeHolder::TryOptimize(BuildLike("%xyz%")); EXPECT_EQ(fnode.descriptor()->name(), "like"); - fnode = LikeHolder::TryOptimize(BuildLike("_xyz_")); + fnode = SQLLikeHolder::TryOptimize(BuildLike("_xyz_")); EXPECT_EQ(fnode.descriptor()->name(), "like"); - fnode = LikeHolder::TryOptimize(BuildLike("%xyz_")); + fnode = SQLLikeHolder::TryOptimize(BuildLike("%xyz_")); EXPECT_EQ(fnode.descriptor()->name(), "like"); - fnode = LikeHolder::TryOptimize(BuildLike("x_yz%")); + fnode = SQLLikeHolder::TryOptimize(BuildLike("x_yz%")); EXPECT_EQ(fnode.descriptor()->name(), "like"); }