From 978b69d54230f38dc9c0b415aa7f2a912cf9ac7e Mon Sep 17 00:00:00 2001 From: frank400 Date: Tue, 30 Mar 2021 15:32:44 -0300 Subject: [PATCH] Implement like function for regex expressions --- cpp/src/gandiva/function_holder_registry.h | 2 + cpp/src/gandiva/function_registry_string.cc | 5 +++ cpp/src/gandiva/like_holder.cc | 19 ++++++++-- cpp/src/gandiva/like_holder.h | 3 ++ cpp/src/gandiva/like_holder_test.cc | 15 ++++++++ cpp/src/gandiva/tests/utf8_test.cc | 41 +++++++++++++++++++++ 6 files changed, 82 insertions(+), 3 deletions(-) diff --git a/cpp/src/gandiva/function_holder_registry.h b/cpp/src/gandiva/function_holder_registry.h index e1c5630e841..7e76347e0de 100644 --- a/cpp/src/gandiva/function_holder_registry.h +++ b/cpp/src/gandiva/function_holder_registry.h @@ -62,6 +62,8 @@ class FunctionHolderRegistry { static map_type& makers() { static map_type maker_map = { {"like", LAMBDA_MAKER(LikeHolder)}, + {"regexp_like", LAMBDA_MAKER(LikeHolder)}, + {"regexp_matches", LAMBDA_MAKER(LikeHolder)}, {"to_date", LAMBDA_MAKER(ToDateHolder)}, {"random", LAMBDA_MAKER(RandomGeneratorHolder)}, {"rand", LAMBDA_MAKER(RandomGeneratorHolder)}, diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index ff438db674e..952ef0fe49d 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -100,6 +100,11 @@ std::vector GetStringFunctionRegistry() { kResultNullIfNull, "gdv_fn_like_utf8_utf8", NativeFunction::kNeedsFunctionHolder), + NativeFunction("regexp_like", {"regexp_matches"}, + DataTypeVector{utf8(), utf8()}, boolean(), + kResultNullIfNull, "gdv_fn_like_utf8_utf8", + NativeFunction::kNeedsFunctionHolder), + NativeFunction("ltrim", {}, DataTypeVector{utf8(), utf8()}, utf8(), kResultNullIfNull, "ltrim_utf8_utf8", NativeFunction::kNeedsContext), diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc index 688a4ffa130..214581c27fc 100644 --- 
a/cpp/src/gandiva/like_holder.cc +++ b/cpp/src/gandiva/like_holder.cc @@ -81,20 +81,33 @@ Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr<LikeHolder>* h Status::Invalid( "'like' function requires a string literal as the second parameter")); + // Checks if it should compile the pattern directly as a regular expression. + auto function_name = node.descriptor()->name(); + if (function_name == "regexp_matches" || function_name == "regexp_like") { + return Make(arrow::util::get<std::string>(literal->holder()), holder, true); + } return Make(arrow::util::get<std::string>(literal->holder()), holder); } Status LikeHolder::Make(const std::string& sql_pattern, std::shared_ptr<LikeHolder>* holder) { + std::shared_ptr<LikeHolder> lholder; std::string pcre_pattern; ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); - - auto lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern)); + lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pcre_pattern)); ARROW_RETURN_IF(!lholder->regex_.ok(), Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); - *holder = lholder; return Status::OK(); } +Status LikeHolder::Make(const std::string& pattern, + std::shared_ptr<LikeHolder>* holder, bool is_regex) { + std::shared_ptr<LikeHolder> lholder; + lholder = std::shared_ptr<LikeHolder>(new LikeHolder(pattern)); + ARROW_RETURN_IF(!lholder->regex_.ok(), + Status::Invalid("Building RE2 pattern '", pattern, "' failed")); + *holder = lholder; + return Status::OK(); +} } // namespace gandiva diff --git a/cpp/src/gandiva/like_holder.h b/cpp/src/gandiva/like_holder.h index 82c9e3b29a6..1c3a96fafc4 100644 --- a/cpp/src/gandiva/like_holder.h +++ b/cpp/src/gandiva/like_holder.h @@ -39,6 +39,9 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder { static Status Make(const std::string& sql_pattern, std::shared_ptr<LikeHolder>* holder); + static Status Make( + const std::string& pattern, std::shared_ptr<LikeHolder>* holder, bool is_regex); + // Try and optimise a function node with a "like" pattern. 
static const FunctionNode TryOptimize(const FunctionNode& node); diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/like_holder_test.cc index ce6697e72d6..fc084901ef5 100644 --- a/cpp/src/gandiva/like_holder_test.cc +++ b/cpp/src/gandiva/like_holder_test.cc @@ -65,6 +65,21 @@ TEST_F(TestLikeHolder, TestMatchOne) { EXPECT_FALSE(like("dabc")); } +TEST_F(TestLikeHolder, TestMatchOneRegex) { + std::shared_ptr<LikeHolder> like_holder; + + auto status = LikeHolder::Make("ab.", &like_holder, true); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *like_holder; + EXPECT_TRUE(like("abc")); + EXPECT_TRUE(like("abd")); + + EXPECT_FALSE(like("a")); + EXPECT_FALSE(like("abcd")); + EXPECT_FALSE(like("dabc")); +} + TEST_F(TestLikeHolder, TestPcreSpecial) { std::shared_ptr<LikeHolder> like_holder; diff --git a/cpp/src/gandiva/tests/utf8_test.cc b/cpp/src/gandiva/tests/utf8_test.cc index 103992d23fe..d0093a53586 100644 --- a/cpp/src/gandiva/tests/utf8_test.cc +++ b/cpp/src/gandiva/tests/utf8_test.cc @@ -221,6 +221,47 @@ TEST_F(TestUtf8, TestLike) { EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); } +TEST_F(TestUtf8, TestRegexpLike) { + // schema for input fields + auto field_a = field("a", utf8()); + auto schema = arrow::schema({field_a}); + + // output fields + auto res = field("res", boolean()); + + // build expressions. + // regexp_like(a, literal(s)) + + auto node_a = TreeExprBuilder::MakeField(field_a); + auto literal_s = TreeExprBuilder::MakeStringLiteral(".*spark.*"); + auto is_like = TreeExprBuilder::MakeFunction("regexp_like", {node_a, literal_s}, boolean()); + auto expr = TreeExprBuilder::MakeExpression(is_like, res); + + // Build a projector for the expressions. 
+ std::shared_ptr<Projector> projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Create a row-batch with some sample data + int num_records = 4; + auto array_a = MakeArrowArrayUtf8({"park", "sparkle", "bright spark and fire", "spark"}, + {true, true, true, true}); + + // expected output + auto exp = MakeArrowArrayBool({false, true, true, true}, {true, true, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a}); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); +} + TEST_F(TestUtf8, TestBeginsEnds) { // schema for input fields auto field_a = field("a", utf8());