From 2efd43e2b3981c477a0d0ce71a51f938df0d428b Mon Sep 17 00:00:00 2001 From: frank400 Date: Tue, 27 Apr 2021 16:40:47 -0300 Subject: [PATCH 1/8] Implement ilike function --- cpp/src/gandiva/CMakeLists.txt | 2 + cpp/src/gandiva/expr_decomposer.cc | 2 + cpp/src/gandiva/function_holder_registry.h | 2 + cpp/src/gandiva/function_registry_string.cc | 4 + cpp/src/gandiva/gdv_function_stubs.cc | 18 +++ cpp/src/gandiva/gdv_function_stubs.h | 3 + cpp/src/gandiva/ilike_holder.cc | 101 ++++++++++++++++ cpp/src/gandiva/ilike_holder.h | 61 ++++++++++ cpp/src/gandiva/ilike_holder_test.cc | 122 ++++++++++++++++++++ 9 files changed, 315 insertions(+) create mode 100644 cpp/src/gandiva/ilike_holder.cc create mode 100644 cpp/src/gandiva/ilike_holder.h create mode 100644 cpp/src/gandiva/ilike_holder_test.cc diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 44b6fab14c3..85b1bfd473a 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -83,6 +83,7 @@ set(SRC_FILES llvm_generator.cc llvm_types.cc like_holder.cc + ilike_holder.cc literal_holder.cc projector.cc regex_util.cc @@ -230,6 +231,7 @@ add_gandiva_test(internals-test to_date_holder_test.cc simple_arena_test.cc like_holder_test.cc + ilike_holder_test.cc decimal_type_util_test.cc random_generator_holder_test.cc hash_utils_test.cc diff --git a/cpp/src/gandiva/expr_decomposer.cc b/cpp/src/gandiva/expr_decomposer.cc index 1c09d28f5e0..7691135417b 100644 --- a/cpp/src/gandiva/expr_decomposer.cc +++ b/cpp/src/gandiva/expr_decomposer.cc @@ -54,6 +54,8 @@ Status ExprDecomposer::Visit(const FieldNode& node) { const FunctionNode ExprDecomposer::TryOptimize(const FunctionNode& node) { if (node.descriptor()->name() == "like") { return LikeHolder::TryOptimize(node); + } else if (node.descriptor()->name() == "ilike") { + return IlikeHolder::TryOptimize(node); } else { return node; } diff --git a/cpp/src/gandiva/function_holder_registry.h b/cpp/src/gandiva/function_holder_registry.h index e1c5630e841..0ccdc214846 100644 --- a/cpp/src/gandiva/function_holder_registry.h +++ b/cpp/src/gandiva/function_holder_registry.h @@ -25,6 +25,7 @@ #include "arrow/status.h" #include "gandiva/function_holder.h" +#include "gandiva/ilike_holder.h" #include "gandiva/like_holder.h" #include "gandiva/node.h" #include "gandiva/random_generator_holder.h" @@ -62,6 +63,7 @@ class FunctionHolderRegistry { static map_type& makers() { static map_type maker_map = { {"like", LAMBDA_MAKER(LikeHolder)}, + {"ilike", LAMBDA_MAKER(IlikeHolder)}, {"to_date", LAMBDA_MAKER(ToDateHolder)}, {"random", LAMBDA_MAKER(RandomGeneratorHolder)}, {"rand", LAMBDA_MAKER(RandomGeneratorHolder)}, diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index e8c0739b3d4..aa9c1d994ab 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -131,6 +131,10 @@ std::vector GetStringFunctionRegistry() { kResultNullIfNull, "gdv_fn_like_utf8_utf8_utf8", NativeFunction::kNeedsFunctionHolder), + NativeFunction("ilike", {}, DataTypeVector{utf8(), utf8()}, boolean(), + kResultNullIfNull, "gdv_fn_ilike_utf8_utf8", + NativeFunction::kNeedsFunctionHolder), + NativeFunction("ltrim", {}, DataTypeVector{utf8(), utf8()}, utf8(), kResultNullIfNull, "ltrim_utf8_utf8", NativeFunction::kNeedsContext), diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 26b8654fb7e..bd8aaed188c 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -29,6 +29,7 @@ #include "gandiva/exported_funcs.h" #include "gandiva/formatting_utils.h" #include "gandiva/hash_utils.h" +#include "gandiva/ilike_holder.h" #include "gandiva/in_holder.h" #include "gandiva/like_holder.h" #include "gandiva/precompiled/types.h" @@ -52,6 +53,12 @@ bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len, return (*holder)(std::string(data, data_len)); } +bool gdv_fn_ilike_utf8_utf8(int64_t ptr, const char* data, int data_len, + const char* pattern, int pattern_len) { + gandiva::IlikeHolder* holder = reinterpret_cast(ptr); + return (*holder)(std::string(data, data_len)); +} + double gdv_fn_random(int64_t ptr) { gandiva::RandomGeneratorHolder* holder = reinterpret_cast(ptr); @@ -752,6 +759,17 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { types->i1_type() /*return_type*/, args, reinterpret_cast(gdv_fn_like_utf8_utf8_utf8)); + // gdv_fn_ilike_utf8_utf8 + args = {types->i64_type(), // int64_t ptr + types->i8_ptr_type(), // const char* data + types->i32_type(), // int data_len + types->i8_ptr_type(), // const char* pattern + types->i32_type()}; // int pattern_len + + engine->AddGlobalMappingForFunc("gdv_fn_ilike_utf8_utf8", + types->i1_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_ilike_utf8_utf8)); + // gdv_fn_to_date_utf8_utf8 args = {types->i64_type(), // int64_t execution_context types->i64_type(), // int64_t holder_ptr diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h index d4a127dd1cf..7b9ee7912d7 100644 --- a/cpp/src/gandiva/gdv_function_stubs.h +++ b/cpp/src/gandiva/gdv_function_stubs.h @@ -50,6 +50,9 @@ bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len, const char* pattern, int pattern_len, const char* escape_char, int escape_char_len); +bool gdv_fn_ilike_utf8_utf8(int64_t ptr, const char* data, int data_len, + const char* pattern, int pattern_len); + int64_t gdv_fn_to_date_utf8_utf8_int32(int64_t context, int64_t ptr, const char* data, int data_len, bool in1_validity, const char* pattern, int pattern_len, diff --git a/cpp/src/gandiva/ilike_holder.cc b/cpp/src/gandiva/ilike_holder.cc new file mode 100644 index 00000000000..ad22b106cc8 --- /dev/null +++ b/cpp/src/gandiva/ilike_holder.cc @@ -0,0 +1,101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/ilike_holder.h" + +#include +#include "gandiva/node.h" +#include "gandiva/regex_util.h" + +namespace gandiva { +RE2::Options IlikeHolder::regex_op_ = RE2::Options(); + +RE2 IlikeHolder::starts_with_regex_(R"((?i)(\w|\s)*\.\*)"); +RE2 IlikeHolder::ends_with_regex_(R"((?i)\.\*(\w|\s)*)"); +RE2 IlikeHolder::is_substr_regex_(R"((?i)\.\*(\w|\s)*\.\*)"); + +// Short-circuit pattern matches for the following common sub cases : +// - starts_with, ends_with and is_substr +const FunctionNode IlikeHolder::TryOptimize(const FunctionNode& node) { + std::shared_ptr holder; + auto status = Make(node, &holder); + if (status.ok()) { + std::string& pattern = holder->pattern_; + auto literal_type = node.children().at(1)->return_type(); + + if (RE2::FullMatch(pattern, starts_with_regex_)) { + auto prefix = pattern.substr(0, pattern.length() - 2); // trim .* + auto prefix_node = + std::make_shared(literal_type, LiteralHolder(prefix), false); + return FunctionNode("starts_with", {node.children().at(0), prefix_node}, + node.return_type()); + } else if (RE2::FullMatch(pattern, ends_with_regex_)) { + auto suffix = pattern.substr(2); // skip .* + auto suffix_node = + std::make_shared(literal_type, LiteralHolder(suffix), false); + return FunctionNode("ends_with", {node.children().at(0), suffix_node}, + node.return_type()); + } else if (RE2::FullMatch(pattern, is_substr_regex_)) { + auto substr = + pattern.substr(2, pattern.length() - 4); // trim starting and ending .* + auto substr_node = + std::make_shared(literal_type, LiteralHolder(substr), false); + return FunctionNode("is_substr", {node.children().at(0), substr_node}, + node.return_type()); + } + } + + // Could not optimize, return original node. + return node; +} + +static bool IsArrowStringLiteral(arrow::Type::type type) { + return type == arrow::Type::STRING || type == arrow::Type::BINARY; +} + +Status IlikeHolder::Make(const FunctionNode& node, std::shared_ptr* holder) { + ARROW_RETURN_IF(node.children().size() != 2, + Status::Invalid("'ilike' function requires two parameters")); + + auto literal = dynamic_cast(node.children().at(1).get()); + ARROW_RETURN_IF( + literal == nullptr, + Status::Invalid("'ilike' function requires a literal as the second parameter")); + + auto literal_type = literal->return_type()->id(); + ARROW_RETURN_IF( + !IsArrowStringLiteral(literal_type), + Status::Invalid( + "'ilike' function requires a string literal as the second parameter")); + + return Make(arrow::util::get(literal->holder()), holder); +} + +Status IlikeHolder::Make(const std::string& sql_pattern, + std::shared_ptr* holder) { + std::string pcre_pattern; + ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); + + regex_op_.set_case_sensitive(false); // set insensitive case. + auto lholder = std::shared_ptr(new IlikeHolder(pcre_pattern)); + ARROW_RETURN_IF(!lholder->regex_.ok(), + Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); + + *holder = lholder; + return Status::OK(); +} +} // namespace gandiva diff --git a/cpp/src/gandiva/ilike_holder.h b/cpp/src/gandiva/ilike_holder.h new file mode 100644 index 00000000000..ce961ac23c3 --- /dev/null +++ b/cpp/src/gandiva/ilike_holder.h @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include + +#include "arrow/status.h" + +#include "gandiva/function_holder.h" +#include "gandiva/like_holder.h" +#include "gandiva/node.h" +#include "gandiva/visibility.h" + +namespace gandiva { + +/// Function Holder for SQL 'ilike' +class GANDIVA_EXPORT IlikeHolder : public FunctionHolder { + public: + ~IlikeHolder() override = default; + static Status Make(const FunctionNode& node, std::shared_ptr* holder); + + static Status Make(const std::string& sql_pattern, + std::shared_ptr* holder); + + // Try and optimise a function node with a "ilike" pattern. + static const FunctionNode TryOptimize(const FunctionNode& node); + + /// Return true if the data matches the pattern. + bool operator()(const std::string& data) { return RE2::FullMatch(data, regex_); } + + private: + explicit IlikeHolder(const std::string& pattern) + : pattern_(pattern), regex_(pattern, regex_op_) {} + + std::string pattern_; // posix pattern string, to help debugging + RE2 regex_; // compiled regex for the pattern + + static RE2 starts_with_regex_; // pre-compiled pattern for matching starts_with + static RE2 ends_with_regex_; // pre-compiled pattern for matching ends_with + static RE2 is_substr_regex_; // pre-compiled pattern for matching is_substr + static RE2::Options regex_op_; // regex option for insensitive case +}; +} // namespace gandiva diff --git a/cpp/src/gandiva/ilike_holder_test.cc b/cpp/src/gandiva/ilike_holder_test.cc new file mode 100644 index 00000000000..5650ecfc054 --- /dev/null +++ b/cpp/src/gandiva/ilike_holder_test.cc @@ -0,0 +1,122 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/ilike_holder.h" +#include "gandiva/regex_util.h" + +#include +#include + +#include + +namespace gandiva { + +class TestILikeHolder : public ::testing::Test { + public: + FunctionNode BuildILike(std::string pattern) { + auto field = std::make_shared(arrow::field("in", arrow::utf8())); + auto pattern_node = + std::make_shared(arrow::utf8(), LiteralHolder(pattern), false); + return FunctionNode("ilike", {field, pattern_node}, arrow::boolean()); + } +}; + +TEST_F(TestILikeHolder, TestMatchAny) { + std::shared_ptr ilike_holder; + + auto status = IlikeHolder::Make("ab%", &ilike_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *ilike_holder; + EXPECT_TRUE(like("ab")); + EXPECT_TRUE(like("aBc")); + EXPECT_TRUE(like("ABCD")); + + EXPECT_FALSE(like("a")); + EXPECT_FALSE(like("cab")); +} + +TEST_F(TestILikeHolder, TestMatchOne) { + std::shared_ptr ilike_holder; + + auto status = IlikeHolder::Make("Ab_", &ilike_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *ilike_holder; + EXPECT_TRUE(like("abc")); + EXPECT_TRUE(like("aBd")); + + EXPECT_FALSE(like("A")); + EXPECT_FALSE(like("Abcd")); + EXPECT_FALSE(like("DaBc")); +} + +TEST_F(TestILikeHolder, TestPcreSpecial) { + std::shared_ptr ilike_holder; + + auto status = IlikeHolder::Make(".*aB_", &ilike_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *ilike_holder; + EXPECT_TRUE(like(".*Abc")); // . and * aren't special in sql regex + EXPECT_FALSE(like("xxAbc")); +} + +TEST_F(TestILikeHolder, TestDot) { + std::shared_ptr ilike_holder; + + auto status = IlikeHolder::Make("aBc.", &ilike_holder); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *ilike_holder; + EXPECT_FALSE(like("abcd")); +} + +TEST_F(TestILikeHolder, TestOptimise) { + // optimise for 'starts_with' + auto fnode = IlikeHolder::TryOptimize(BuildILike("xy 123z%")); + EXPECT_EQ(fnode.descriptor()->name(), "starts_with"); + EXPECT_EQ(fnode.ToString(), "bool starts_with((string) in, (const string) xy 123z)"); + + // optimise for 'ends_with' + fnode = IlikeHolder::TryOptimize(BuildILike("%xyz")); + EXPECT_EQ(fnode.descriptor()->name(), "ends_with"); + EXPECT_EQ(fnode.ToString(), "bool ends_with((string) in, (const string) xyz)"); + + // optimise for 'is_substr' + fnode = IlikeHolder::TryOptimize(BuildILike("%abc%")); + EXPECT_EQ(fnode.descriptor()->name(), "is_substr"); + EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string) abc)"); + + // no optimisation for others. + fnode = IlikeHolder::TryOptimize(BuildILike("xyz_")); + EXPECT_EQ(fnode.descriptor()->name(), "ilike"); + + fnode = IlikeHolder::TryOptimize(BuildILike("_xyz")); + EXPECT_EQ(fnode.descriptor()->name(), "ilike"); + + fnode = IlikeHolder::TryOptimize(BuildILike("_xyz_")); + EXPECT_EQ(fnode.descriptor()->name(), "ilike"); + + fnode = IlikeHolder::TryOptimize(BuildILike("%xyz_")); + EXPECT_EQ(fnode.descriptor()->name(), "ilike"); + + fnode = IlikeHolder::TryOptimize(BuildILike("x_yz%")); + EXPECT_EQ(fnode.descriptor()->name(), "ilike"); +} + +} // namespace gandiva From b78085a146cb87c751eda252897d877bbef84b03 Mon Sep 17 00:00:00 2001 From: frank400 Date: Wed, 28 Apr 2021 10:02:48 -0300 Subject: [PATCH 2/8] Fix miss include --- cpp/src/gandiva/ilike_holder.cc | 2 -- cpp/src/gandiva/ilike_holder.h | 1 - 2 files changed, 3 deletions(-) diff --git a/cpp/src/gandiva/ilike_holder.cc b/cpp/src/gandiva/ilike_holder.cc index ad22b106cc8..8e9d13a7298 100644 --- a/cpp/src/gandiva/ilike_holder.cc +++ b/cpp/src/gandiva/ilike_holder.cc @@ -22,8 +22,6 @@ #include "gandiva/regex_util.h" namespace gandiva { -RE2::Options IlikeHolder::regex_op_ = RE2::Options(); - RE2 IlikeHolder::starts_with_regex_(R"((?i)(\w|\s)*\.\*)"); RE2 IlikeHolder::ends_with_regex_(R"((?i)\.\*(\w|\s)*)"); RE2 IlikeHolder::is_substr_regex_(R"((?i)\.\*(\w|\s)*\.\*)"); diff --git a/cpp/src/gandiva/ilike_holder.h b/cpp/src/gandiva/ilike_holder.h index ce961ac23c3..61811b143d6 100644 --- a/cpp/src/gandiva/ilike_holder.h +++ b/cpp/src/gandiva/ilike_holder.h @@ -25,7 +25,6 @@ #include "arrow/status.h" #include "gandiva/function_holder.h" -#include "gandiva/like_holder.h" #include "gandiva/node.h" #include "gandiva/visibility.h" From 4be6cc611a5f4919368d757c5b7ea019186fe317 Mon Sep 17 00:00:00 2001 From: frank400 Date: Wed, 28 Apr 2021 10:20:44 -0300 Subject: [PATCH 3/8] Fix redefined function --- cpp/src/gandiva/ilike_holder.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/gandiva/ilike_holder.cc b/cpp/src/gandiva/ilike_holder.cc index 8e9d13a7298..87c466c5c9c 100644 --- a/cpp/src/gandiva/ilike_holder.cc +++ b/cpp/src/gandiva/ilike_holder.cc @@ -61,7 +61,7 @@ const FunctionNode IlikeHolder::TryOptimize(const FunctionNode& node) { return node; } -static bool IsArrowStringLiteral(arrow::Type::type type) { +static bool IsStringLiteral(arrow::Type::type type) { return type == arrow::Type::STRING || type == arrow::Type::BINARY; } @@ -76,7 +76,7 @@ Status IlikeHolder::Make(const FunctionNode& node, std::shared_ptr* auto literal_type = literal->return_type()->id(); ARROW_RETURN_IF( - !IsArrowStringLiteral(literal_type), + !IsStringLiteral(literal_type), Status::Invalid( "'ilike' function requires a string literal as the second parameter")); From c6a8372cd120a522df7db094576df55ecdf2afe5 Mon Sep 17 00:00:00 2001 From: frank400 Date: Wed, 28 Apr 2021 14:11:31 -0300 Subject: [PATCH 4/8] Delete unnecessary holder --- cpp/src/gandiva/CMakeLists.txt | 4 +- cpp/src/gandiva/expr_decomposer.cc | 4 +- cpp/src/gandiva/function_holder_registry.h | 3 +- cpp/src/gandiva/gdv_function_stubs.cc | 3 +- cpp/src/gandiva/ilike_holder.cc | 99 ----------------- cpp/src/gandiva/ilike_holder.h | 60 ---------- cpp/src/gandiva/ilike_holder_test.cc | 122 --------------------- cpp/src/gandiva/like_holder.cc | 19 ++++ cpp/src/gandiva/like_holder.h | 6 + cpp/src/gandiva/like_holder_test.cc | 99 +++++++++++++++++ 10 files changed, 128 insertions(+), 291 deletions(-) delete mode 100644 cpp/src/gandiva/ilike_holder.cc delete mode 100644 cpp/src/gandiva/ilike_holder.h delete mode 100644 cpp/src/gandiva/ilike_holder_test.cc diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 85b1bfd473a..9e6a0433424 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -83,8 +83,7 @@ set(SRC_FILES llvm_generator.cc llvm_types.cc like_holder.cc - ilike_holder.cc - literal_holder.cc + literal_holder.cc projector.cc regex_util.cc selection_vector.cc @@ -231,7 +230,6 @@ add_gandiva_test(internals-test to_date_holder_test.cc simple_arena_test.cc like_holder_test.cc - ilike_holder_test.cc decimal_type_util_test.cc random_generator_holder_test.cc hash_utils_test.cc diff --git a/cpp/src/gandiva/expr_decomposer.cc b/cpp/src/gandiva/expr_decomposer.cc index 7691135417b..4f198e0ed82 100644 --- a/cpp/src/gandiva/expr_decomposer.cc +++ b/cpp/src/gandiva/expr_decomposer.cc @@ -52,10 +52,8 @@ Status ExprDecomposer::Visit(const FieldNode& node) { // eg. replacing 'like' with 'starts_with' can save function calls at evaluation // time. const FunctionNode ExprDecomposer::TryOptimize(const FunctionNode& node) { - if (node.descriptor()->name() == "like") { + if (node.descriptor()->name() == "like" || node.descriptor()->name() == "ilike") { return LikeHolder::TryOptimize(node); - } else if (node.descriptor()->name() == "ilike") { - return IlikeHolder::TryOptimize(node); } else { return node; } diff --git a/cpp/src/gandiva/function_holder_registry.h b/cpp/src/gandiva/function_holder_registry.h index 0ccdc214846..225c73207fc 100644 --- a/cpp/src/gandiva/function_holder_registry.h +++ b/cpp/src/gandiva/function_holder_registry.h @@ -25,7 +25,6 @@ #include "arrow/status.h" #include "gandiva/function_holder.h" -#include "gandiva/ilike_holder.h" #include "gandiva/like_holder.h" #include "gandiva/node.h" #include "gandiva/random_generator_holder.h" @@ -63,7 +62,7 @@ class FunctionHolderRegistry { static map_type& makers() { static map_type maker_map = { {"like", LAMBDA_MAKER(LikeHolder)}, - {"ilike", LAMBDA_MAKER(IlikeHolder)}, + {"ilike", LAMBDA_MAKER(LikeHolder)}, {"to_date", LAMBDA_MAKER(ToDateHolder)}, {"random", LAMBDA_MAKER(RandomGeneratorHolder)}, {"rand", LAMBDA_MAKER(RandomGeneratorHolder)}, diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index bd8aaed188c..fc4cb9c7ed6 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -29,7 +29,6 @@ #include "gandiva/exported_funcs.h" #include "gandiva/formatting_utils.h" #include "gandiva/hash_utils.h" -#include "gandiva/ilike_holder.h" #include "gandiva/in_holder.h" #include "gandiva/like_holder.h" #include "gandiva/precompiled/types.h" @@ -55,7 +54,7 @@ bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len, bool gdv_fn_ilike_utf8_utf8(int64_t ptr, const char* data, int data_len, const char* pattern, int pattern_len) { - gandiva::IlikeHolder* holder = reinterpret_cast(ptr); + gandiva::LikeHolder* holder = reinterpret_cast(ptr); return (*holder)(std::string(data, data_len)); } diff --git a/cpp/src/gandiva/ilike_holder.cc b/cpp/src/gandiva/ilike_holder.cc deleted file mode 100644 index 87c466c5c9c..00000000000 --- a/cpp/src/gandiva/ilike_holder.cc +++ /dev/null @@ -1,99 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "gandiva/ilike_holder.h" - -#include -#include "gandiva/node.h" -#include "gandiva/regex_util.h" - -namespace gandiva { -RE2 IlikeHolder::starts_with_regex_(R"((?i)(\w|\s)*\.\*)"); -RE2 IlikeHolder::ends_with_regex_(R"((?i)\.\*(\w|\s)*)"); -RE2 IlikeHolder::is_substr_regex_(R"((?i)\.\*(\w|\s)*\.\*)"); - -// Short-circuit pattern matches for the following common sub cases : -// - starts_with, ends_with and is_substr -const FunctionNode IlikeHolder::TryOptimize(const FunctionNode& node) { - std::shared_ptr holder; - auto status = Make(node, &holder); - if (status.ok()) { - std::string& pattern = holder->pattern_; - auto literal_type = node.children().at(1)->return_type(); - - if (RE2::FullMatch(pattern, starts_with_regex_)) { - auto prefix = pattern.substr(0, pattern.length() - 2); // trim .* - auto prefix_node = - std::make_shared(literal_type, LiteralHolder(prefix), false); - return FunctionNode("starts_with", {node.children().at(0), prefix_node}, - node.return_type()); - } else if (RE2::FullMatch(pattern, ends_with_regex_)) { - auto suffix = pattern.substr(2); // skip .* - auto suffix_node = - std::make_shared(literal_type, LiteralHolder(suffix), false); - return FunctionNode("ends_with", {node.children().at(0), suffix_node}, - node.return_type()); - } else if (RE2::FullMatch(pattern, is_substr_regex_)) { - auto substr = - pattern.substr(2, pattern.length() - 4); // trim starting and ending .* - auto substr_node = - std::make_shared(literal_type, LiteralHolder(substr), false); - return FunctionNode("is_substr", {node.children().at(0), substr_node}, - node.return_type()); - } - } - - // Could not optimize, return original node. - return node; -} - -static bool IsStringLiteral(arrow::Type::type type) { - return type == arrow::Type::STRING || type == arrow::Type::BINARY; -} - -Status IlikeHolder::Make(const FunctionNode& node, std::shared_ptr* holder) { - ARROW_RETURN_IF(node.children().size() != 2, - Status::Invalid("'ilike' function requires two parameters")); - - auto literal = dynamic_cast(node.children().at(1).get()); - ARROW_RETURN_IF( - literal == nullptr, - Status::Invalid("'ilike' function requires a literal as the second parameter")); - - auto literal_type = literal->return_type()->id(); - ARROW_RETURN_IF( - !IsStringLiteral(literal_type), - Status::Invalid( - "'ilike' function requires a string literal as the second parameter")); - - return Make(arrow::util::get(literal->holder()), holder); -} - -Status IlikeHolder::Make(const std::string& sql_pattern, - std::shared_ptr* holder) { - std::string pcre_pattern; - ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); - - regex_op_.set_case_sensitive(false); // set insensitive case. - auto lholder = std::shared_ptr(new IlikeHolder(pcre_pattern)); - ARROW_RETURN_IF(!lholder->regex_.ok(), - Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); - - *holder = lholder; - return Status::OK(); -} -} // namespace gandiva diff --git a/cpp/src/gandiva/ilike_holder.h b/cpp/src/gandiva/ilike_holder.h deleted file mode 100644 index 61811b143d6..00000000000 --- a/cpp/src/gandiva/ilike_holder.h +++ /dev/null @@ -1,60 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include - -#include "arrow/status.h" - -#include "gandiva/function_holder.h" -#include "gandiva/node.h" -#include "gandiva/visibility.h" - -namespace gandiva { - -/// Function Holder for SQL 'ilike' -class GANDIVA_EXPORT IlikeHolder : public FunctionHolder { - public: - ~IlikeHolder() override = default; - static Status Make(const FunctionNode& node, std::shared_ptr* holder); - - static Status Make(const std::string& sql_pattern, - std::shared_ptr* holder); - - // Try and optimise a function node with a "ilike" pattern. - static const FunctionNode TryOptimize(const FunctionNode& node); - - /// Return true if the data matches the pattern. - bool operator()(const std::string& data) { return RE2::FullMatch(data, regex_); } - - private: - explicit IlikeHolder(const std::string& pattern) - : pattern_(pattern), regex_(pattern, regex_op_) {} - - std::string pattern_; // posix pattern string, to help debugging - RE2 regex_; // compiled regex for the pattern - - static RE2 starts_with_regex_; // pre-compiled pattern for matching starts_with - static RE2 ends_with_regex_; // pre-compiled pattern for matching ends_with - static RE2 is_substr_regex_; // pre-compiled pattern for matching is_substr - static RE2::Options regex_op_; // regex option for insensitive case -}; -} // namespace gandiva diff --git a/cpp/src/gandiva/ilike_holder_test.cc b/cpp/src/gandiva/ilike_holder_test.cc deleted file mode 100644 index 5650ecfc054..00000000000 --- a/cpp/src/gandiva/ilike_holder_test.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "gandiva/ilike_holder.h" -#include "gandiva/regex_util.h" - -#include -#include - -#include - -namespace gandiva { - -class TestILikeHolder : public ::testing::Test { - public: - FunctionNode BuildILike(std::string pattern) { - auto field = std::make_shared(arrow::field("in", arrow::utf8())); - auto pattern_node = - std::make_shared(arrow::utf8(), LiteralHolder(pattern), false); - return FunctionNode("ilike", {field, pattern_node}, arrow::boolean()); - } -}; - -TEST_F(TestILikeHolder, TestMatchAny) { - std::shared_ptr ilike_holder; - - auto status = IlikeHolder::Make("ab%", &ilike_holder); - EXPECT_EQ(status.ok(), true) << status.message(); - - auto& like = *ilike_holder; - EXPECT_TRUE(like("ab")); - EXPECT_TRUE(like("aBc")); - EXPECT_TRUE(like("ABCD")); - - EXPECT_FALSE(like("a")); - EXPECT_FALSE(like("cab")); -} - -TEST_F(TestILikeHolder, TestMatchOne) { - std::shared_ptr ilike_holder; - - auto status = IlikeHolder::Make("Ab_", &ilike_holder); - EXPECT_EQ(status.ok(), true) << status.message(); - - auto& like = *ilike_holder; - EXPECT_TRUE(like("abc")); - EXPECT_TRUE(like("aBd")); - - EXPECT_FALSE(like("A")); - EXPECT_FALSE(like("Abcd")); - EXPECT_FALSE(like("DaBc")); -} - -TEST_F(TestILikeHolder, TestPcreSpecial) { - std::shared_ptr ilike_holder; - - auto status = IlikeHolder::Make(".*aB_", &ilike_holder); - EXPECT_EQ(status.ok(), true) << status.message(); - - auto& like = *ilike_holder; - EXPECT_TRUE(like(".*Abc")); // . and * aren't special in sql regex - EXPECT_FALSE(like("xxAbc")); -} - -TEST_F(TestILikeHolder, TestDot) { - std::shared_ptr ilike_holder; - - auto status = IlikeHolder::Make("aBc.", &ilike_holder); - EXPECT_EQ(status.ok(), true) << status.message(); - - auto& like = *ilike_holder; - EXPECT_FALSE(like("abcd")); -} - -TEST_F(TestILikeHolder, TestOptimise) { - // optimise for 'starts_with' - auto fnode = IlikeHolder::TryOptimize(BuildILike("xy 123z%")); - EXPECT_EQ(fnode.descriptor()->name(), "starts_with"); - EXPECT_EQ(fnode.ToString(), "bool starts_with((string) in, (const string) xy 123z)"); - - // optimise for 'ends_with' - fnode = IlikeHolder::TryOptimize(BuildILike("%xyz")); - EXPECT_EQ(fnode.descriptor()->name(), "ends_with"); - EXPECT_EQ(fnode.ToString(), "bool ends_with((string) in, (const string) xyz)"); - - // optimise for 'is_substr' - fnode = IlikeHolder::TryOptimize(BuildILike("%abc%")); - EXPECT_EQ(fnode.descriptor()->name(), "is_substr"); - EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string) abc)"); - - // no optimisation for others. - fnode = IlikeHolder::TryOptimize(BuildILike("xyz_")); - EXPECT_EQ(fnode.descriptor()->name(), "ilike"); - - fnode = IlikeHolder::TryOptimize(BuildILike("_xyz")); - EXPECT_EQ(fnode.descriptor()->name(), "ilike"); - - fnode = IlikeHolder::TryOptimize(BuildILike("_xyz_")); - EXPECT_EQ(fnode.descriptor()->name(), "ilike"); - - fnode = IlikeHolder::TryOptimize(BuildILike("%xyz_")); - EXPECT_EQ(fnode.descriptor()->name(), "ilike"); - - fnode = IlikeHolder::TryOptimize(BuildILike("x_yz%")); - EXPECT_EQ(fnode.descriptor()->name(), "ilike"); -} - -} // namespace gandiva diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc index 5a3510e3652..48e390f197f 100644 --- a/cpp/src/gandiva/like_holder.cc +++ b/cpp/src/gandiva/like_holder.cc @@ -80,6 +80,12 @@ Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr* h !IsArrowStringLiteral(literal_type), Status::Invalid( "'like' function requires a string literal as the second parameter")); + if (node.descriptor()->name() == "ilike") { + RE2::Options regex_op; + regex_op.set_case_sensitive(false); // set case-insensitive for ilike function. + + return Make(arrow::util::get(literal->holder()), holder, regex_op); + } if (node.children().size() == 2) { return Make(arrow::util::get(literal->holder()), holder); } else { @@ -132,4 +138,17 @@ Status LikeHolder::Make(const std::string& sql_pattern, const std::string& escap return Status::OK(); } +Status LikeHolder::Make(const std::string& sql_pattern, + std::shared_ptr* holder, RE2::Options regex_op) { + std::string pcre_pattern; + ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); + + auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern, regex_op)); + ARROW_RETURN_IF(!lholder->regex_.ok(), + Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); + + *holder = lholder; + return Status::OK(); +} + } // namespace gandiva diff --git a/cpp/src/gandiva/like_holder.h b/cpp/src/gandiva/like_holder.h index c7982e91437..8c83200fc4c 100644 --- a/cpp/src/gandiva/like_holder.h +++ b/cpp/src/gandiva/like_holder.h @@ -42,6 +42,9 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder { static Status Make(const std::string& sql_pattern, const std::string& escape_char, std::shared_ptr* holder); + static Status Make(const std::string& sql_pattern, std::shared_ptr* holder, + RE2::Options regex_op); + // Try and optimise a function node with a "like" pattern. static const FunctionNode TryOptimize(const FunctionNode& node); @@ -51,6 +54,9 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder { private: explicit LikeHolder(const std::string& pattern) : pattern_(pattern), regex_(pattern) {} + explicit LikeHolder(const std::string& pattern, RE2::Options regex_op) + : pattern_(pattern), regex_(pattern, regex_op) {} + std::string pattern_; // posix pattern string, to help debugging RE2 regex_; // compiled regex for the pattern diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/like_holder_test.cc index 18e585fc502..925a62d34bb 100644 --- a/cpp/src/gandiva/like_holder_test.cc +++ b/cpp/src/gandiva/like_holder_test.cc @@ -211,4 +211,103 @@ TEST_F(TestLikeHolder, TestMultipleEscapeChar) { auto status = LikeHolder::Make("ab\\_", "\\\\", &like_holder); EXPECT_EQ(status.ok(), false) << status.message(); } +class TestILikeHolder : public ::testing::Test { + public: + RE2::Options regex_op; + FunctionNode BuildILike(std::string pattern) { + auto field = std::make_shared(arrow::field("in", arrow::utf8())); + auto pattern_node = + std::make_shared(arrow::utf8(), LiteralHolder(pattern), false); + return FunctionNode("ilike", {field, pattern_node}, arrow::boolean()); + } +}; + +TEST_F(TestILikeHolder, TestMatchAny) { + std::shared_ptr like_holder; + + regex_op.set_case_sensitive(false); + auto status = LikeHolder::Make("ab%", &like_holder, regex_op); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *like_holder; + EXPECT_TRUE(like("ab")); + EXPECT_TRUE(like("aBc")); + EXPECT_TRUE(like("ABCD")); + + EXPECT_FALSE(like("a")); + EXPECT_FALSE(like("cab")); +} + +TEST_F(TestILikeHolder, TestMatchOne) { + std::shared_ptr like_holder; + + regex_op.set_case_sensitive(false); + auto status = LikeHolder::Make("Ab_", &like_holder, regex_op); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *like_holder; + EXPECT_TRUE(like("abc")); + EXPECT_TRUE(like("aBd")); + + EXPECT_FALSE(like("A")); + EXPECT_FALSE(like("Abcd")); + EXPECT_FALSE(like("DaBc")); +} + +TEST_F(TestILikeHolder, TestPcreSpecial) { + std::shared_ptr like_holder; + + regex_op.set_case_sensitive(false); + auto status = LikeHolder::Make(".*aB_", &like_holder, regex_op); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *like_holder; + EXPECT_TRUE(like(".*Abc")); // . and * aren't special in sql regex + EXPECT_FALSE(like("xxAbc")); +} + +TEST_F(TestILikeHolder, TestDot) { + std::shared_ptr like_holder; + + regex_op.set_case_sensitive(false); + auto status = LikeHolder::Make("aBc.", &like_holder, regex_op); + EXPECT_EQ(status.ok(), true) << status.message(); + + auto& like = *like_holder; + EXPECT_FALSE(like("abcd")); +} + +TEST_F(TestILikeHolder, TestOptimise) { + // optimise for 'starts_with' + auto fnode = LikeHolder::TryOptimize(BuildILike("xy 123z%")); + EXPECT_EQ(fnode.descriptor()->name(), "starts_with"); + EXPECT_EQ(fnode.ToString(), "bool starts_with((string) in, (const string) xy 123z)"); + + // optimise for 'ends_with' + fnode = LikeHolder::TryOptimize(BuildILike("%xyz")); + EXPECT_EQ(fnode.descriptor()->name(), "ends_with"); + EXPECT_EQ(fnode.ToString(), "bool ends_with((string) in, (const string) xyz)"); + + // optimise for 'is_substr' + fnode = LikeHolder::TryOptimize(BuildILike("%abc%")); + EXPECT_EQ(fnode.descriptor()->name(), "is_substr"); + EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string) abc)"); + + // no optimisation for others. + fnode = LikeHolder::TryOptimize(BuildILike("xyz_")); + EXPECT_EQ(fnode.descriptor()->name(), "ilike"); + + fnode = LikeHolder::TryOptimize(BuildILike("_xyz")); + EXPECT_EQ(fnode.descriptor()->name(), "ilike"); + + fnode = LikeHolder::TryOptimize(BuildILike("_xyz_")); + EXPECT_EQ(fnode.descriptor()->name(), "ilike"); + + fnode = LikeHolder::TryOptimize(BuildILike("%xyz_")); + EXPECT_EQ(fnode.descriptor()->name(), "ilike"); + + fnode = LikeHolder::TryOptimize(BuildILike("x_yz%")); + EXPECT_EQ(fnode.descriptor()->name(), "ilike"); +} + } // namespace gandiva From a48414931bd0bca07d7a0ec77eb78038a22cc551 Mon Sep 17 00:00:00 2001 From: frank400 Date: Wed, 28 Apr 2021 14:13:55 -0300 Subject: [PATCH 5/8] Fix checkstyle on cmake file --- cpp/src/gandiva/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 9e6a0433424..44b6fab14c3 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -83,7 +83,7 @@ set(SRC_FILES llvm_generator.cc llvm_types.cc like_holder.cc - literal_holder.cc + literal_holder.cc projector.cc regex_util.cc selection_vector.cc From c2363b10fd246b2605c27f6590ce08aea030f717 Mon Sep 17 00:00:00 2001 From: frank400 Date: Thu, 29 Apr 2021 11:01:37 -0300 Subject: [PATCH 6/8] Disable TryOptimize for ilike --- cpp/src/gandiva/expr_decomposer.cc | 2 +- cpp/src/gandiva/like_holder_test.cc | 33 ----------------------------- 2 files changed, 1 insertion(+), 34 deletions(-) diff --git a/cpp/src/gandiva/expr_decomposer.cc b/cpp/src/gandiva/expr_decomposer.cc index 4f198e0ed82..1c09d28f5e0 100644 --- a/cpp/src/gandiva/expr_decomposer.cc +++ b/cpp/src/gandiva/expr_decomposer.cc @@ -52,7 +52,7 @@ Status ExprDecomposer::Visit(const FieldNode& node) { // eg. replacing 'like' with 'starts_with' can save function calls at evaluation // time. const FunctionNode ExprDecomposer::TryOptimize(const FunctionNode& node) { - if (node.descriptor()->name() == "like" || node.descriptor()->name() == "ilike") { + if (node.descriptor()->name() == "like") { return LikeHolder::TryOptimize(node); } else { return node; diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/like_holder_test.cc index 925a62d34bb..342f3c56c17 100644 --- a/cpp/src/gandiva/like_holder_test.cc +++ b/cpp/src/gandiva/like_holder_test.cc @@ -277,37 +277,4 @@ TEST_F(TestILikeHolder, TestDot) { EXPECT_FALSE(like("abcd")); } -TEST_F(TestILikeHolder, TestOptimise) { - // optimise for 'starts_with' - auto fnode = LikeHolder::TryOptimize(BuildILike("xy 123z%")); - EXPECT_EQ(fnode.descriptor()->name(), "starts_with"); - EXPECT_EQ(fnode.ToString(), "bool starts_with((string) in, (const string) xy 123z)"); - - // optimise for 'ends_with' - fnode = LikeHolder::TryOptimize(BuildILike("%xyz")); - EXPECT_EQ(fnode.descriptor()->name(), "ends_with"); - EXPECT_EQ(fnode.ToString(), "bool ends_with((string) in, (const string) xyz)"); - - // optimise for 'is_substr' - fnode = LikeHolder::TryOptimize(BuildILike("%abc%")); - EXPECT_EQ(fnode.descriptor()->name(), "is_substr"); - EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string) abc)"); - - // no optimisation for others. - fnode = LikeHolder::TryOptimize(BuildILike("xyz_")); - EXPECT_EQ(fnode.descriptor()->name(), "ilike"); - - fnode = LikeHolder::TryOptimize(BuildILike("_xyz")); - EXPECT_EQ(fnode.descriptor()->name(), "ilike"); - - fnode = LikeHolder::TryOptimize(BuildILike("_xyz_")); - EXPECT_EQ(fnode.descriptor()->name(), "ilike"); - - fnode = LikeHolder::TryOptimize(BuildILike("%xyz_")); - EXPECT_EQ(fnode.descriptor()->name(), "ilike"); - - fnode = LikeHolder::TryOptimize(BuildILike("x_yz%")); - EXPECT_EQ(fnode.descriptor()->name(), "ilike"); -} - } // namespace gandiva From 97e6e2d83d9fcbbb2de3a3738cfb3bef0fb191d4 Mon Sep 17 00:00:00 2001 From: frank400 Date: Fri, 30 Apr 2021 12:51:28 -0300 Subject: [PATCH 7/8] Remove unnecessary Make method --- cpp/src/gandiva/like_holder.cc | 12 +++++++++--- cpp/src/gandiva/like_holder.h | 2 +- cpp/src/gandiva/like_holder_test.cc | 9 +++++---- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc index 48e390f197f..05e03b51aa8 100644 --- a/cpp/src/gandiva/like_holder.cc +++ b/cpp/src/gandiva/like_holder.cc @@ -80,8 +80,9 @@ Status LikeHolder::Make(const FunctionNode& node, std::shared_ptr* h !IsArrowStringLiteral(literal_type), Status::Invalid( "'like' function requires a string literal as the second parameter")); + + RE2::Options regex_op; if (node.descriptor()->name() == "ilike") { - RE2::Options regex_op; regex_op.set_case_sensitive(false); // set case-insensitive for ilike function. return Make(arrow::util::get(literal->holder()), holder, regex_op); @@ -143,12 +144,17 @@ Status LikeHolder::Make(const std::string& sql_pattern, std::string pcre_pattern; ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); - auto lholder = std::shared_ptr(new LikeHolder(pcre_pattern, regex_op)); + std::shared_ptr lholder; + if (regex_op.case_sensitive()) { + lholder = std::shared_ptr(new LikeHolder(pcre_pattern)); + } else { + lholder = std::shared_ptr(new LikeHolder(pcre_pattern, regex_op)); + } + ARROW_RETURN_IF(!lholder->regex_.ok(), Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed")); *holder = lholder; return Status::OK(); } - } // namespace gandiva diff --git a/cpp/src/gandiva/like_holder.h b/cpp/src/gandiva/like_holder.h index 8c83200fc4c..73e58017de1 100644 --- a/cpp/src/gandiva/like_holder.h +++ b/cpp/src/gandiva/like_holder.h @@ -54,7 +54,7 @@ class GANDIVA_EXPORT LikeHolder : public FunctionHolder { private: explicit LikeHolder(const std::string& pattern) : pattern_(pattern), regex_(pattern) {} - explicit LikeHolder(const std::string& pattern, RE2::Options regex_op) + LikeHolder(const std::string& pattern, RE2::Options regex_op) : pattern_(pattern), regex_(pattern, regex_op) {} std::string pattern_; // posix pattern string, to help debugging diff --git a/cpp/src/gandiva/like_holder_test.cc b/cpp/src/gandiva/like_holder_test.cc index 342f3c56c17..a52533a1138 100644 --- a/cpp/src/gandiva/like_holder_test.cc +++ b/cpp/src/gandiva/like_holder_test.cc @@ -27,6 +27,7 @@ namespace gandiva { class TestLikeHolder : public ::testing::Test { public: + RE2::Options regex_op; FunctionNode BuildLike(std::string pattern) { auto field = std::make_shared(arrow::field("in", arrow::utf8())); auto pattern_node = @@ -48,7 +49,7 @@ class TestLikeHolder : public ::testing::Test { TEST_F(TestLikeHolder, TestMatchAny) { std::shared_ptr like_holder; - auto status = LikeHolder::Make("ab%", &like_holder); + auto status = LikeHolder::Make("ab%", &like_holder, regex_op); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -63,7 +64,7 @@ TEST_F(TestLikeHolder, TestMatchAny) { TEST_F(TestLikeHolder, TestMatchOne) { std::shared_ptr like_holder; - auto status = LikeHolder::Make("ab_", &like_holder); + auto status = LikeHolder::Make("ab_", &like_holder, regex_op); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -78,7 +79,7 @@ TEST_F(TestLikeHolder, TestMatchOne) { TEST_F(TestLikeHolder, TestPcreSpecial) { std::shared_ptr like_holder; - auto status = LikeHolder::Make(".*ab_", &like_holder); + auto status = LikeHolder::Make(".*ab_", &like_holder, regex_op); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; @@ -97,7 +98,7 @@ TEST_F(TestLikeHolder, TestRegexEscape) { TEST_F(TestLikeHolder, TestDot) { std::shared_ptr like_holder; - auto status = LikeHolder::Make("abc.", &like_holder); + auto status = LikeHolder::Make("abc.", &like_holder, regex_op); EXPECT_EQ(status.ok(), true) << status.message(); auto& like = *like_holder; From f160880d215f39411525be66fc9b86c7fafd43c4 Mon Sep 17 00:00:00 2001 From: frank400 Date: Mon, 3 May 2021 09:50:35 -0300 Subject: [PATCH 8/8] Optimize holder constructor call --- cpp/src/gandiva/like_holder.cc | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/src/gandiva/like_holder.cc b/cpp/src/gandiva/like_holder.cc index 05e03b51aa8..af9ac67d66a 100644 --- a/cpp/src/gandiva/like_holder.cc +++ b/cpp/src/gandiva/like_holder.cc @@ -145,11 +145,7 @@ Status LikeHolder::Make(const std::string& sql_pattern, ARROW_RETURN_NOT_OK(RegexUtil::SqlLikePatternToPcre(sql_pattern, pcre_pattern)); std::shared_ptr lholder; - if (regex_op.case_sensitive()) { - lholder = std::shared_ptr(new LikeHolder(pcre_pattern)); - } else { - lholder = std::shared_ptr(new LikeHolder(pcre_pattern, regex_op)); - } + lholder = std::shared_ptr(new LikeHolder(pcre_pattern, regex_op)); ARROW_RETURN_IF(!lholder->regex_.ok(), Status::Invalid("Building RE2 pattern '", pcre_pattern, "' failed"));