From b1d9a7a15ea75869ada5de59a4e1502d4671867e Mon Sep 17 00:00:00 2001 From: Ivan Chesnov Date: Wed, 3 Apr 2024 11:10:03 +0300 Subject: [PATCH 1/5] GH-40968 add set_dot_nl(true) for Like option --- cpp/src/gandiva/regex_functions_holder.cc | 22 ++++--------- cpp/src/gandiva/regex_functions_holder.h | 5 ++- .../gandiva/regex_functions_holder_test.cc | 32 +++++++++++++++---- 3 files changed, 33 insertions(+), 26 deletions(-) diff --git a/cpp/src/gandiva/regex_functions_holder.cc b/cpp/src/gandiva/regex_functions_holder.cc index 03a4af90d89..cf0d7930f14 100644 --- a/cpp/src/gandiva/regex_functions_holder.cc +++ b/cpp/src/gandiva/regex_functions_holder.cc @@ -99,13 +99,14 @@ Result> LikeHolder::Make(const FunctionNode& node) { "'like' function requires a string literal as the second parameter")); RE2::Options regex_op; + regex_op.set_dot_nl(true); // set dotall mode for the regex. if (node.descriptor()->name() == "ilike") { regex_op.set_case_sensitive(false); // set case-insensitive for ilike function. return Make(std::get(literal->holder()), regex_op); } if (node.children().size() == 2) { - return Make(std::get(literal->holder())); + return Make(std::get(literal->holder()), regex_op); } else { auto escape_char = dynamic_cast(node.children().at(2).get()); ARROW_RETURN_IF( @@ -118,24 +119,13 @@ Result> LikeHolder::Make(const FunctionNode& node) { Status::Invalid( "'like' function requires a string literal as the third parameter")); return Make(std::get(literal->holder()), - std::get(escape_char->holder())); + std::get(escape_char->holder()), regex_op); } } -Result> LikeHolder::Make(const std::string& sql_pattern) { - std::string pcre_pattern; - ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); - - auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern)); - ARROW_RETURN_IF(!lholder->regex_.ok(), - Status::Invalid("Building RE2 pattern '", pcre_pattern, - "' failed with: ", lholder->regex_.error())); - - return lholder; -} - Result> LikeHolder::Make(const std::string& sql_pattern, - const std::string& escape_char) { + const std::string& escape_char, + RE2::Options regex_op) { ARROW_RETURN_IF(escape_char.length() > 1, Status::Invalid("The length of escape char ", escape_char, " in 'like' function is greater than 1")); @@ -147,7 +137,7 @@ Result> LikeHolder::Make(const std::string& sql_patt ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); } - auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern)); + auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern, regex_op)); ARROW_RETURN_IF(!lholder->regex_.ok(), Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed with: ", lholder->regex_.error())); diff --git a/cpp/src/gandiva/regex_functions_holder.h b/cpp/src/gandiva/regex_functions_holder.h index 36d942510bb..82836fa5fe0 100644 --- a/cpp/src/gandiva/regex_functions_holder.h +++ b/cpp/src/gandiva/regex_functions_holder.h @@ -37,10 +37,9 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder { static Result> Make(const FunctionNode& node); - static Result> Make(const std::string& sql_pattern); - static Result> Make(const std::string& sql_pattern, - const std::string& escape_char); + const std::string& escape_char, + RE2::Options regex_op); static Result> Make(const std::string& sql_pattern, RE2::Options regex_op); diff --git a/cpp/src/gandiva/regex_functions_holder_test.cc b/cpp/src/gandiva/regex_functions_holder_test.cc index 534be5987a2..620278ed894 100644 --- a/cpp/src/gandiva/regex_functions_holder_test.cc +++ b/cpp/src/gandiva/regex_functions_holder_test.cc @@ -46,6 +46,7 @@ class TestLikeHolder : public ::testing::Test { }; TEST_F(TestLikeHolder, TestMatchAny) { + regex_op.set_dot_nl(true); EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab%", regex_op)); auto& like = *like_holder; @@ -58,6 +59,7 @@ TEST_F(TestLikeHolder, TestMatchAny) { } TEST_F(TestLikeHolder, TestMatchOne) { + regex_op.set_dot_nl(true); EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab_", regex_op)); auto& like = *like_holder; @@ -70,6 +72,7 @@ TEST_F(TestLikeHolder, TestMatchOne) { } TEST_F(TestLikeHolder, TestPcreSpecial) { + regex_op.set_dot_nl(true); EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make(".*ab_", regex_op)); auto& like = *like_holder; @@ -77,6 +80,14 @@ TEST_F(TestLikeHolder, TestPcreSpecial) { EXPECT_FALSE(like("xxabc")); } +TEST_F(TestLikeHolder, TestPcreSpecialWithNewLine) { + regex_op.set_dot_nl(true); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("%Space1.%", regex_op)); + + auto& like = *like_holder; + EXPECT_TRUE(like("[name: \"Space1.protect\"\nargs: \"count\"\ncolumn_name: \"pass_count\"]")); +} + TEST_F(TestLikeHolder, TestRegexEscape) { std::string res; ARROW_EXPECT_OK(RegexUtil::SqlLikePatternToPcre("#%hello#_abc_def##", '#', res)); @@ -85,6 +96,7 @@ TEST_F(TestLikeHolder, TestRegexEscape) { } TEST_F(TestLikeHolder, TestDot) { + regex_op.set_dot_nl(true); EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("abc.", regex_op)); auto& like = *like_holder; @@ -92,13 +104,14 @@ TEST_F(TestLikeHolder, TestDot) { } TEST_F(TestLikeHolder, TestMatchSubString) { - EXPECT_OK_AND_ASSIGN(auto like_holder, LikeHolder::Make("%abc%", "\\")); + regex_op.set_dot_nl(true); + EXPECT_OK_AND_ASSIGN(auto like_holder, LikeHolder::Make("%abc%", "\\", regex_op)); auto& like = *like_holder; EXPECT_TRUE(like("abc")); EXPECT_FALSE(like("xxabdc")); - EXPECT_OK_AND_ASSIGN(like_holder, LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\")); + EXPECT_OK_AND_ASSIGN(like_holder, LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\", regex_op)); auto& like_reserved_char = *like_holder; EXPECT_TRUE(like_reserved_char("XXab-.^$*+?()[]{}|—/c%d")); @@ -173,7 +186,8 @@ TEST_F(TestLikeHolder, TestOptimise) { } TEST_F(TestLikeHolder, TestMatchOneEscape) { - EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "\\")); + regex_op.set_dot_nl(true); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "\\", regex_op)); auto& like = *like_holder; @@ -187,7 +201,8 @@ TEST_F(TestLikeHolder, TestMatchOneEscape) { } TEST_F(TestLikeHolder, TestMatchManyEscape) { - EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\%", "\\")); + regex_op.set_dot_nl(true); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\%", "\\", regex_op)); auto& like = *like_holder; @@ -201,7 +216,8 @@ TEST_F(TestLikeHolder, TestMatchManyEscape) { } TEST_F(TestLikeHolder, TestMatchEscape) { - EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\\\", "\\")); + regex_op.set_dot_nl(true); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\\\", "\\", regex_op)); auto& like = *like_holder; @@ -211,7 +227,8 @@ TEST_F(TestLikeHolder, TestMatchEscape) { } TEST_F(TestLikeHolder, TestEmptyEscapeChar) { - EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "")); + regex_op.set_dot_nl(true); + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "", regex_op)); auto& like = *like_holder; @@ -223,7 +240,8 @@ TEST_F(TestLikeHolder, TestEmptyEscapeChar) { } TEST_F(TestLikeHolder, TestMultipleEscapeChar) { - ASSERT_RAISES(Invalid, LikeHolder::Make("ab\\_", "\\\\").status()); + regex_op.set_dot_nl(true); + ASSERT_RAISES(Invalid, LikeHolder::Make("ab\\_", "\\\\", regex_op).status()); } class TestILikeHolder : public ::testing::Test { From f63772fa270f7dd71d5ee03c329f4b8a47b44028 Mon Sep 17 00:00:00 2001 From: Ivan Chesnov Date: Wed, 3 Apr 2024 11:28:57 +0300 Subject: [PATCH 2/5] fixed codestyle --- cpp/src/gandiva/regex_functions_holder.cc | 2 +- cpp/src/gandiva/regex_functions_holder_test.cc | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/src/gandiva/regex_functions_holder.cc b/cpp/src/gandiva/regex_functions_holder.cc index cf0d7930f14..ced355362d9 100644 --- a/cpp/src/gandiva/regex_functions_holder.cc +++ b/cpp/src/gandiva/regex_functions_holder.cc @@ -99,7 +99,7 @@ Result> LikeHolder::Make(const FunctionNode& node) { "'like' function requires a string literal as the second parameter")); RE2::Options regex_op; - regex_op.set_dot_nl(true); // set dotall mode for the regex. + regex_op.set_dot_nl(true); // set dotall mode for the regex. if (node.descriptor()->name() == "ilike") { regex_op.set_case_sensitive(false); // set case-insensitive for ilike function. diff --git a/cpp/src/gandiva/regex_functions_holder_test.cc b/cpp/src/gandiva/regex_functions_holder_test.cc index 620278ed894..0bc2c24c1da 100644 --- a/cpp/src/gandiva/regex_functions_holder_test.cc +++ b/cpp/src/gandiva/regex_functions_holder_test.cc @@ -85,7 +85,8 @@ TEST_F(TestLikeHolder, TestPcreSpecialWithNewLine) { EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("%Space1.%", regex_op)); auto& like = *like_holder; - EXPECT_TRUE(like("[name: \"Space1.protect\"\nargs: \"count\"\ncolumn_name: \"pass_count\"]")); + EXPECT_TRUE( + like("[name: \"Space1.protect\"\nargs: \"count\"\ncolumn_name: \"pass_count\"]")); } TEST_F(TestLikeHolder, TestRegexEscape) { @@ -111,7 +112,8 @@ TEST_F(TestLikeHolder, TestMatchSubString) { EXPECT_TRUE(like("abc")); EXPECT_FALSE(like("xxabdc")); - EXPECT_OK_AND_ASSIGN(like_holder, LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\", regex_op)); + EXPECT_OK_AND_ASSIGN(like_holder, + LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\", regex_op)); auto& like_reserved_char = *like_holder; EXPECT_TRUE(like_reserved_char("XXab-.^$*+?()[]{}|—/c%d")); @@ -217,7 +219,8 @@ TEST_F(TestLikeHolder, TestMatchManyEscape) { TEST_F(TestLikeHolder, TestMatchEscape) { regex_op.set_dot_nl(true); - EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\\\", "\\", regex_op)); + EXPECT_OK_AND_ASSIGN(auto const like_holder, + LikeHolder::Make("ab\\\\", "\\", regex_op)); auto& like = *like_holder; From 20fc8b83d286d2a8bac1ff05d62ecba7af656433 Mon Sep 17 00:00:00 2001 From: Ivan Chesnov Date: Wed, 3 Apr 2024 11:33:07 +0300 Subject: [PATCH 3/5] fixed codestyle --- cpp/src/gandiva/regex_functions_holder_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/gandiva/regex_functions_holder_test.cc b/cpp/src/gandiva/regex_functions_holder_test.cc index 0bc2c24c1da..3451ef86c42 100644 --- a/cpp/src/gandiva/regex_functions_holder_test.cc +++ b/cpp/src/gandiva/regex_functions_holder_test.cc @@ -112,7 +112,7 @@ TEST_F(TestLikeHolder, TestMatchSubString) { EXPECT_TRUE(like("abc")); EXPECT_FALSE(like("xxabdc")); - EXPECT_OK_AND_ASSIGN(like_holder, + EXPECT_OK_AND_ASSIGN(like_holder, LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\", regex_op)); auto& like_reserved_char = *like_holder; @@ -219,7 +219,7 @@ TEST_F(TestLikeHolder, TestMatchManyEscape) { TEST_F(TestLikeHolder, TestMatchEscape) { regex_op.set_dot_nl(true); - EXPECT_OK_AND_ASSIGN(auto const like_holder, + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\\\", "\\", regex_op)); auto& like = *like_holder; From 4695c8726f454c046e68b353fa04ec8ef2a386cd Mon Sep 17 00:00:00 2001 From: Ivan Chesnov Date: Tue, 9 Apr 2024 11:58:18 +0300 Subject: [PATCH 4/5] refactored tests --- .../gandiva/regex_functions_holder_test.cc | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/cpp/src/gandiva/regex_functions_holder_test.cc b/cpp/src/gandiva/regex_functions_holder_test.cc index 3451ef86c42..64657e88c64 100644 --- a/cpp/src/gandiva/regex_functions_holder_test.cc +++ b/cpp/src/gandiva/regex_functions_holder_test.cc @@ -28,6 +28,8 @@ namespace gandiva { class TestLikeHolder : public ::testing::Test { public: RE2::Options regex_op; + void SetUp() { regex_op.set_dot_nl(true); } + FunctionNode BuildLike(std::string pattern) { auto field = std::make_shared(arrow::field("in", arrow::utf8())); auto pattern_node = @@ -46,7 +48,6 @@ class TestLikeHolder : public ::testing::Test { }; TEST_F(TestLikeHolder, TestMatchAny) { - regex_op.set_dot_nl(true); EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab%", regex_op)); auto& like = *like_holder; @@ -59,7 +60,6 @@ TEST_F(TestLikeHolder, TestMatchAny) { } TEST_F(TestLikeHolder, TestMatchOne) { - regex_op.set_dot_nl(true); EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab_", regex_op)); auto& like = *like_holder; @@ -72,7 +72,6 @@ TEST_F(TestLikeHolder, TestMatchOne) { } TEST_F(TestLikeHolder, TestPcreSpecial) { - regex_op.set_dot_nl(true); EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make(".*ab_", regex_op)); auto& like = *like_holder; @@ -81,7 +80,6 @@ TEST_F(TestLikeHolder, TestPcreSpecial) { } TEST_F(TestLikeHolder, TestPcreSpecialWithNewLine) { - regex_op.set_dot_nl(true); EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("%Space1.%", regex_op)); auto& like = *like_holder; @@ -97,15 +95,20 @@ TEST_F(TestLikeHolder, TestRegexEscape) { } TEST_F(TestLikeHolder, TestDot) { - regex_op.set_dot_nl(true); EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("abc.", regex_op)); auto& like = *like_holder; EXPECT_FALSE(like("abcd")); } +TEST_F(TestLikeHolder, TestMatchWithNewLine) { + EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("%abc%", regex_op)); + + auto& like = *like_holder; + EXPECT_TRUE(like("abc\nd")); +} + TEST_F(TestLikeHolder, TestMatchSubString) { - regex_op.set_dot_nl(true); EXPECT_OK_AND_ASSIGN(auto like_holder, LikeHolder::Make("%abc%", "\\", regex_op)); auto& like = *like_holder; @@ -188,7 +191,6 @@ TEST_F(TestLikeHolder, TestOptimise) { } TEST_F(TestLikeHolder, TestMatchOneEscape) { - regex_op.set_dot_nl(true); EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "\\", regex_op)); auto& like = *like_holder; @@ -203,7 +205,6 @@ TEST_F(TestLikeHolder, TestMatchOneEscape) { } TEST_F(TestLikeHolder, TestMatchManyEscape) { - regex_op.set_dot_nl(true); EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\%", "\\", regex_op)); auto& like = *like_holder; @@ -218,7 +219,6 @@ TEST_F(TestLikeHolder, TestMatchManyEscape) { } TEST_F(TestLikeHolder, TestMatchEscape) { - regex_op.set_dot_nl(true); EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\\\", "\\", regex_op)); @@ -230,7 +230,6 @@ TEST_F(TestLikeHolder, TestMatchEscape) { } TEST_F(TestLikeHolder, TestEmptyEscapeChar) { - regex_op.set_dot_nl(true); EXPECT_OK_AND_ASSIGN(auto const like_holder, LikeHolder::Make("ab\\_", "", regex_op)); auto& like = *like_holder; @@ -243,7 +242,6 @@ TEST_F(TestLikeHolder, TestEmptyEscapeChar) { } TEST_F(TestLikeHolder, TestMultipleEscapeChar) { - regex_op.set_dot_nl(true); ASSERT_RAISES(Invalid, LikeHolder::Make("ab\\_", "\\\\", regex_op).status()); } From 9b9e3c72c438aaa26e1a1b3940f45aae6f43dab9 Mon Sep 17 00:00:00 2001 From: Ivan Chesnov Date: Tue, 9 Apr 2024 15:00:53 +0300 Subject: [PATCH 5/5] restore Make method for LIKE --- cpp/src/gandiva/regex_functions_holder.cc | 14 ++++++++++++++ cpp/src/gandiva/regex_functions_holder.h | 2 ++ 2 files changed, 16 insertions(+) diff --git a/cpp/src/gandiva/regex_functions_holder.cc b/cpp/src/gandiva/regex_functions_holder.cc index ced355362d9..ef07a9ef0bc 100644 --- a/cpp/src/gandiva/regex_functions_holder.cc +++ b/cpp/src/gandiva/regex_functions_holder.cc @@ -123,6 +123,20 @@ Result> LikeHolder::Make(const FunctionNode& node) { } } +Result> LikeHolder::Make(const std::string& sql_pattern) { + std::string pcre_pattern; + ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); + + RE2::Options regex_op; + regex_op.set_dot_nl(true); // set dotall mode for the regex. + auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern, regex_op)); + ARROW_RETURN_IF(!lholder->regex_.ok(), + Status::Invalid("Building RE2 pattern '", pcre_pattern, + "' failed with: ", lholder->regex_.error())); + + return lholder; +} + Result> LikeHolder::Make(const std::string& sql_pattern, const std::string& escape_char, RE2::Options regex_op) { diff --git a/cpp/src/gandiva/regex_functions_holder.h b/cpp/src/gandiva/regex_functions_holder.h index 82836fa5fe0..354c2b53d95 100644 --- a/cpp/src/gandiva/regex_functions_holder.h +++ b/cpp/src/gandiva/regex_functions_holder.h @@ -37,6 +37,8 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder { static Result> Make(const FunctionNode& node); + static Result> Make(const std::string& sql_pattern); + static Result> Make(const std::string& sql_pattern, const std::string& escape_char, RE2::Options regex_op);