From dc1fd8ddc6ab995436af492463a1477bcc258133 Mon Sep 17 00:00:00 2001 From: zzzxl1993 <474696115@qq.com> Date: Wed, 22 Nov 2023 15:19:19 +0800 Subject: [PATCH] [feature] match_phrase_prefix feature added --- be/src/exec/olap_common.h | 2 + be/src/exec/olap_utils.h | 14 ++- be/src/olap/match_predicate.cpp | 5 +- .../query/phrase_prefix_query.cpp | 63 ++++++++++++ .../query/phrase_prefix_query.h | 54 ++++++++++ .../inverted_index/query/prefix_query.cpp | 80 +++++++++++++++ .../inverted_index/query/prefix_query.h | 40 ++++++++ .../segment_v2/inverted_index_query_type.h | 4 + .../segment_v2/inverted_index_reader.cpp | 33 ++++++- .../rowset/segment_v2/inverted_index_reader.h | 6 ++ be/src/vec/functions/match.cpp | 1 + be/src/vec/functions/match.h | 17 ++++ .../org/apache/doris/nereids/DorisLexer.g4 | 1 + .../org/apache/doris/nereids/DorisParser.g4 | 2 +- fe/fe-core/src/main/cup/sql_parser.cup | 5 +- .../apache/doris/analysis/MatchPredicate.java | 11 +++ .../nereids/parser/LogicalPlanBuilder.java | 7 ++ .../nereids/trees/expressions/Match.java | 2 + .../trees/expressions/MatchPhrasePrefix.java | 49 ++++++++++ .../visitor/ExpressionVisitor.java | 5 + .../org/apache/doris/qe/SessionVariable.java | 8 ++ fe/fe-core/src/main/jflex/sql_scanner.flex | 1 + gensrc/thrift/Opcodes.thrift | 1 + gensrc/thrift/PaloInternalService.thrift | 2 + .../test_index_match_phrase_prefix.out | 31 ++++++ .../test_index_match_phrase_prefix.groovy | 98 +++++++++++++++++++ 26 files changed, 534 insertions(+), 8 deletions(-) create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.cpp create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.h create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchPhrasePrefix.java create mode 
100644 regression-test/data/inverted_index_p0/test_index_match_phrase_prefix.out create mode 100644 regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix.groovy diff --git a/be/src/exec/olap_common.h b/be/src/exec/olap_common.h index 91a27d980f826d..acf81a48eb45e7 100644 --- a/be/src/exec/olap_common.h +++ b/be/src/exec/olap_common.h @@ -304,6 +304,8 @@ class ColumnValueRange { condition.__set_condition_op("match_all"); } else if (value.first == MatchType::MATCH_PHRASE) { condition.__set_condition_op("match_phrase"); + } else if (value.first == MatchType::MATCH_PHRASE_PREFIX) { + condition.__set_condition_op("match_phrase_prefix"); } else if (value.first == MatchType::MATCH_ELEMENT_EQ) { condition.__set_condition_op("match_element_eq"); } else if (value.first == MatchType::MATCH_ELEMENT_LT) { diff --git a/be/src/exec/olap_utils.h b/be/src/exec/olap_utils.h index 6af46ee91b55a9..5efcc012364e39 100644 --- a/be/src/exec/olap_utils.h +++ b/be/src/exec/olap_utils.h @@ -169,6 +169,7 @@ enum class MatchType { MATCH_ELEMENT_GT = 5, MATCH_ELEMENT_LE = 6, MATCH_ELEMENT_GE = 7, + MATCH_PHRASE_PREFIX = 8, }; inline MatchType to_match_type(TExprOpcode::type type) { @@ -182,6 +183,9 @@ inline MatchType to_match_type(TExprOpcode::type type) { case TExprOpcode::type::MATCH_PHRASE: return MatchType::MATCH_PHRASE; break; + case TExprOpcode::type::MATCH_PHRASE_PREFIX: + return MatchType::MATCH_PHRASE_PREFIX; + break; case TExprOpcode::type::MATCH_ELEMENT_EQ: return MatchType::MATCH_ELEMENT_EQ; break; @@ -211,6 +215,8 @@ inline MatchType to_match_type(const std::string& condition_op) { return MatchType::MATCH_ALL; } else if (condition_op.compare("match_phrase") == 0) { return MatchType::MATCH_PHRASE; + } else if (condition_op.compare("match_phrase_prefix") == 0) { + return MatchType::MATCH_PHRASE_PREFIX; } else if (condition_op.compare("match_element_eq") == 0) { return MatchType::MATCH_ELEMENT_EQ; } else if (condition_op.compare("match_element_lt") == 0) { @@ -228,6 
+234,7 @@ inline MatchType to_match_type(const std::string& condition_op) { inline bool is_match_condition(const std::string& op) { if (0 == strcasecmp(op.c_str(), "match_any") || 0 == strcasecmp(op.c_str(), "match_all") || 0 == strcasecmp(op.c_str(), "match_phrase") || + 0 == strcasecmp(op.c_str(), "match_phrase_prefix") || 0 == strcasecmp(op.c_str(), "match_element_eq") || 0 == strcasecmp(op.c_str(), "match_element_lt") || 0 == strcasecmp(op.c_str(), "match_element_gt") || @@ -240,9 +247,10 @@ inline bool is_match_condition(const std::string& op) { inline bool is_match_operator(const TExprOpcode::type& op_type) { return TExprOpcode::MATCH_ANY == op_type || TExprOpcode::MATCH_ALL == op_type || - TExprOpcode::MATCH_PHRASE == op_type || TExprOpcode::MATCH_ELEMENT_EQ == op_type || - TExprOpcode::MATCH_ELEMENT_LT == op_type || TExprOpcode::MATCH_ELEMENT_GT == op_type || - TExprOpcode::MATCH_ELEMENT_LE == op_type || TExprOpcode::MATCH_ELEMENT_GE == op_type; + TExprOpcode::MATCH_PHRASE == op_type || TExprOpcode::MATCH_PHRASE_PREFIX == op_type || + TExprOpcode::MATCH_ELEMENT_EQ == op_type || TExprOpcode::MATCH_ELEMENT_LT == op_type || + TExprOpcode::MATCH_ELEMENT_GT == op_type || TExprOpcode::MATCH_ELEMENT_LE == op_type || + TExprOpcode::MATCH_ELEMENT_GE == op_type; } } // namespace doris diff --git a/be/src/olap/match_predicate.cpp b/be/src/olap/match_predicate.cpp index f98acae787a9ef..aa4d993a62eea4 100644 --- a/be/src/olap/match_predicate.cpp +++ b/be/src/olap/match_predicate.cpp @@ -107,6 +107,9 @@ InvertedIndexQueryType MatchPredicate::_to_inverted_index_query_type(MatchType m case MatchType::MATCH_PHRASE: ret = InvertedIndexQueryType::MATCH_PHRASE_QUERY; break; + case MatchType::MATCH_PHRASE_PREFIX: + ret = InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY; + break; case MatchType::MATCH_ELEMENT_EQ: ret = InvertedIndexQueryType::EQUAL_QUERY; break; @@ -129,7 +132,7 @@ InvertedIndexQueryType MatchPredicate::_to_inverted_index_query_type(MatchType m } bool 
MatchPredicate::_skip_evaluate(InvertedIndexIterator* iterator) const { - if (_match_type == MatchType::MATCH_PHRASE && + if ((_match_type == MatchType::MATCH_PHRASE || _match_type == MatchType::MATCH_PHRASE_PREFIX) && iterator->get_inverted_index_reader_type() == InvertedIndexReaderType::FULLTEXT && get_parser_phrase_support_string_from_properties(iterator->get_index_properties()) == INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp new file mode 100644 index 00000000000000..4b0340cda4a011 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "phrase_prefix_query.h" + +#include "olap/rowset//segment_v2/inverted_index/query/prefix_query.h" + +namespace doris { + +namespace segment_v2 { + +PhrasePrefixQuery::PhrasePrefixQuery(const std::shared_ptr& searcher) + : _searcher(searcher) {} + +void PhrasePrefixQuery::add(const std::wstring& field_name, const std::vector& terms) { + if (terms.empty()) { + return; + } + + for (size_t i = 0; i < terms.size(); i++) { + if (i < terms.size() - 1) { + std::wstring ws = StringUtil::string_to_wstring(terms[i]); + Term* t = _CLNEW Term(field_name.c_str(), ws.c_str()); + _query.add(t); + _CLDECDELETE(t); + } else { + std::vector prefix_terms; + PrefixQuery::get_prefix_terms(_searcher->getReader(), field_name, terms[i], + prefix_terms, _max_expansions); + if (prefix_terms.empty()) { + continue; + } + _query.add(prefix_terms); + for (auto& t : prefix_terms) { + _CLDECDELETE(t); + } + } + } +} + +void PhrasePrefixQuery::search(roaring::Roaring& roaring) { + _searcher->_search(&_query, [&roaring](const int32_t docid, const float_t /*score*/) { + roaring.add(docid); + }); +} + +} // namespace segment_v2 + +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h new file mode 100644 index 00000000000000..28007620ce581e --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include + +#include "CLucene/search/MultiPhraseQuery.h" +#include "roaring/roaring.hh" + +CL_NS_USE(index) +CL_NS_USE(search) + +namespace doris { + +namespace segment_v2 {
+
+// Phrase query whose last term is matched as a prefix, built on a CLucene
+// MultiPhraseQuery. Call order used by the reader: set_max_expansions(),
+// add(), then search().
+class PhrasePrefixQuery {
+public:
+    // `searcher` is stored and used by both add() and search().
+    PhrasePrefixQuery(const std::shared_ptr& searcher);
+    ~PhrasePrefixQuery() = default;
+
+    // Sets the limit handed to the prefix expansion of the last term.
+    void set_max_expansions(int32_t max_expansions) { _max_expansions = max_expansions; }
+
+    // Adds the phrase `terms` for field `field_name`; the last entry is the prefix.
+    void add(const std::wstring& field_name, const std::vector& terms);
+    // Executes the query and adds the matching doc ids to `roaring`.
+    void search(roaring::Roaring& roaring);
+
+private:
+    std::shared_ptr _searcher;
+    // Underlying CLucene query that add() fills and search() executes.
+    MultiPhraseQuery _query;
+
+    // Expansion limit for the trailing prefix; 50 unless set_max_expansions() is called.
+    int32_t _max_expansions = 50;
+};
+ +} // namespace segment_v2 + +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.cpp new file mode 100644 index 00000000000000..7d23d6eb60f348 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.cpp @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "prefix_query.h" + +namespace doris { + +void PrefixQuery::get_prefix_terms(IndexReader* reader, const std::wstring& field_name, + const std::string& prefix, + std::vector& prefix_terms, + int32_t max_expansions) { + std::wstring ws_prefix = StringUtil::string_to_wstring(prefix); + + Term* prefix_term = _CLNEW Term(field_name.c_str(), ws_prefix.c_str()); + TermEnum* enumerator = reader->terms(prefix_term); + + int32_t count = 0; + Term* lastTerm = nullptr; + try { + const TCHAR* prefixText = prefix_term->text(); + const TCHAR* prefixField = prefix_term->field(); + const TCHAR* tmp = nullptr; + size_t i = 0; + size_t prefixLen = prefix_term->textLength(); + do { + lastTerm = enumerator->term(); + if (lastTerm != nullptr && lastTerm->field() == prefixField) { + size_t termLen = lastTerm->textLength(); + if (prefixLen > termLen) { + break; + } + + tmp = lastTerm->text(); + + for (i = prefixLen - 1; i != -1; --i) { + if (tmp[i] != prefixText[i]) { + tmp = nullptr; + break; + } + } + if (tmp == nullptr) { + break; + } + + if (max_expansions > 0 && count >= max_expansions) { + break; + } + + Term* t = _CLNEW Term(field_name.c_str(), tmp); + prefix_terms.push_back(t); + count++; + } else { + break; + } + _CLDECDELETE(lastTerm); + } while (enumerator->next()); + } + _CLFINALLY({ + enumerator->close(); + _CLDELETE(enumerator); + _CLDECDELETE(lastTerm); + _CLDECDELETE(prefix_term); + }); +} + +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.h new file mode 100644 index 00000000000000..5deb0c1a3628ad --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/prefix_query.h @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include + +CL_NS_USE(index) + +namespace doris {
+
+// Helper that expands a prefix into the indexed terms starting with it.
+class PrefixQuery {
+public:
+    PrefixQuery() = default;
+    ~PrefixQuery() = default;
+
+    // Appends to `prefix_terms` one newly allocated Term for each term of
+    // `field_name` in `reader` that starts with `prefix`. The terms are not
+    // released by this function. `max_expansions` caps how many terms are
+    // appended; a value <= 0 means no cap.
+    static void get_prefix_terms(IndexReader* reader, const std::wstring& field_name,
+                                 const std::string& prefix,
+                                 std::vector& prefix_terms,
+                                 int32_t max_expansions = 50);
+};
+ +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h index 64171c7739d2d6..6d91c3e2ecf708 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_query_type.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_query_type.h @@ -76,6 +76,7 @@ enum class InvertedIndexQueryType { MATCH_ANY_QUERY = 5, MATCH_ALL_QUERY = 6, MATCH_PHRASE_QUERY = 7, + MATCH_PHRASE_PREFIX_QUERY = 8, }; inline std::string query_type_to_string(InvertedIndexQueryType query_type) { @@ -107,6 +108,9 @@ inline std::string query_type_to_string(InvertedIndexQueryType query_type) { case InvertedIndexQueryType::MATCH_PHRASE_QUERY: { return "MPHRASE"; } + case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY: { + return "MPHRASEPREFIX"; + } default: return ""; } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index d3ea5246814192..64427bf0612eeb 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -40,6 +40,8 @@ #include #include +#include "inverted_index_query_type.h" + #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wshadow-field" @@ -57,6 +59,7 @@ #include "olap/olap_common.h" #include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h" #include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h" +#include "olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h" #include
"olap/rowset/segment_v2/inverted_index_cache.h" #include "olap/rowset/segment_v2/inverted_index_compound_directory.h" #include "olap/types.h" @@ -79,7 +82,8 @@ bool InvertedIndexReader::_is_range_query(InvertedIndexQueryType query_type) { bool InvertedIndexReader::_is_match_query(InvertedIndexQueryType query_type) { return (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY || query_type == InvertedIndexQueryType::MATCH_ALL_QUERY || - query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY); + query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY || + query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY); } bool InvertedIndexReader::indexExists(io::Path& index_file_path) { @@ -256,7 +260,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run search_str, get_parser_string_from_properties(_index_meta.properties())); if (query_type == InvertedIndexQueryType::MATCH_ALL_QUERY || query_type == InvertedIndexQueryType::MATCH_ANY_QUERY || - query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { + query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY || + query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY) { LOG(WARNING) << msg; return Status::OK(); } else { @@ -276,6 +281,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run roaring::Roaring query_match_bitmap; bool null_bitmap_already_read = false; if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY || + query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY || query_type == InvertedIndexQueryType::MATCH_ALL_QUERY || query_type == InvertedIndexQueryType::EQUAL_QUERY) { std::string str_tokens; @@ -321,6 +327,10 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run res = normal_index_search(stats, query_type, *searcher_ptr, null_bitmap_already_read, query, term_match_bitmap); + } else if (query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY) { + res = 
match_phrase_prefix_index_search(stats, runtime_state, field_ws, + analyse_result, *searcher_ptr, + term_match_bitmap); } else { res = match_all_index_search(stats, runtime_state, field_ws, analyse_result, *searcher_ptr, term_match_bitmap); @@ -466,6 +476,25 @@ Status FullTextIndexReader::match_all_index_search( return Status::OK(); } +Status FullTextIndexReader::match_phrase_prefix_index_search( + OlapReaderStatistics* stats, RuntimeState* runtime_state, const std::wstring& field_ws, + const std::vector& analyse_result, + const FulltextIndexSearcherPtr& index_searcher, + const std::shared_ptr& term_match_bitmap) { + TQueryOptions queryOptions = runtime_state->query_options(); + try { + SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); + PhrasePrefixQuery query(index_searcher); + query.set_max_expansions(queryOptions.inverted_index_max_expansions); + query.add(field_ws, analyse_result); + query.search(*term_match_bitmap); + } catch (const CLuceneError& e) { + return Status::Error("CLuceneError occured: {}", + e.what()); + } + return Status::OK(); +} + void FullTextIndexReader::check_null_bitmap(const FulltextIndexSearcherPtr& index_searcher, bool& null_bitmap_already_read) { // try to reuse index_searcher's directory to read null_bitmap to cache diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index 248bbcc45c0ee6..1dc5095b54a097 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -166,6 +166,12 @@ class FullTextIndexReader : public InvertedIndexReader { const FulltextIndexSearcherPtr& index_searcher, const std::shared_ptr& term_match_bitmap); + Status match_phrase_prefix_index_search( + OlapReaderStatistics* stats, RuntimeState* runtime_state, const std::wstring& field_ws, + const std::vector& analyse_result, + const FulltextIndexSearcherPtr& index_searcher, + const std::shared_ptr& 
term_match_bitmap); + void check_null_bitmap(const FulltextIndexSearcherPtr& index_searcher, bool& null_bitmap_already_read); }; diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp index 91e2c164856dfc..6b8f6a4d8eaf0a 100644 --- a/be/src/vec/functions/match.cpp +++ b/be/src/vec/functions/match.cpp @@ -314,6 +314,7 @@ void register_function_match(SimpleFunctionFactory& factory) { factory.register_function(); factory.register_function(); factory.register_function(); + factory.register_function(); factory.register_function(); factory.register_function(); factory.register_function(); diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h index bfb2b6f036b912..ee32ee0eaf28a1 100644 --- a/be/src/vec/functions/match.h +++ b/be/src/vec/functions/match.h @@ -128,6 +128,23 @@ class FunctionMatchPhrase : public FunctionMatchBase { ColumnUInt8::Container& result) const override; };
+// Function object registered under the name "match_phrase_prefix".
+// Its execute_match() has no implementation and always reports an error.
+class FunctionMatchPhrasePrefix : public FunctionMatchBase {
+public:
+    static constexpr auto name = "match_phrase_prefix";
+    static FunctionPtr create() { return std::make_shared(); }
+
+    String get_name() const override { return name; }
+
+    // Not supported for this function: ignores all arguments and returns an
+    // error Status.
+    Status execute_match(const std::string& column_name, const std::string& match_query_str,
+                         size_t input_rows_count, const ColumnString* string_col,
+                         InvertedIndexCtx* inverted_index_ctx,
+                         const ColumnArray::Offsets64* array_offsets,
+                         ColumnUInt8::Container& result) const override {
+        return Status::Error(
+                "FunctionMatchPhrasePrefix not support execute_match");
+    }
+};
+
 class FunctionMatchElementEQ : public FunctionMatchBase { public: static constexpr auto name = "match_element_eq"; diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 index 194b9da7caa1c9..f3e7e8c583b995 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisLexer.g4
@@ -344,6 +344,7 @@ MATCH_ELEMENT_GT: 'ELEMENT_GT'; MATCH_ELEMENT_LE: 'ELEMENT_LE'; MATCH_ELEMENT_LT: 'ELEMENT_LT'; MATCH_PHRASE: 'MATCH_PHRASE'; +MATCH_PHRASE_PREFIX: 'MATCH_PHRASE_PREFIX'; MATERIALIZED: 'MATERIALIZED'; MAX: 'MAX'; MAXVALUE: 'MAXVALUE'; diff --git a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 index faa4f1b2e2b8cd..36fe9a6d3d5dce 100644 --- a/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 +++ b/fe/fe-core/src/main/antlr4/org/apache/doris/nereids/DorisParser.g4 @@ -582,7 +582,7 @@ rowConstructor predicate : NOT? kind=BETWEEN lower=valueExpression AND upper=valueExpression | NOT? kind=(LIKE | REGEXP | RLIKE) pattern=valueExpression - | NOT? kind=(MATCH | MATCH_ANY | MATCH_ALL | MATCH_PHRASE) pattern=valueExpression + | NOT? kind=(MATCH | MATCH_ANY | MATCH_ALL | MATCH_PHRASE | MATCH_PHRASE_PREFIX) pattern=valueExpression | NOT? kind=IN LEFT_PAREN query RIGHT_PAREN | NOT? kind=IN LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN | IS NOT? 
kind=NULL diff --git a/fe/fe-core/src/main/cup/sql_parser.cup b/fe/fe-core/src/main/cup/sql_parser.cup index 3a7b5b1b5fc5d4..01c0e7a2618d44 100644 --- a/fe/fe-core/src/main/cup/sql_parser.cup +++ b/fe/fe-core/src/main/cup/sql_parser.cup @@ -478,6 +478,7 @@ terminal String KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, + KW_MATCH_PHRASE_PREFIX, KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, @@ -975,7 +976,7 @@ precedence left KW_AND; precedence left KW_NOT, NOT; precedence left KW_BETWEEN, KW_IN, KW_IS, KW_EXISTS; precedence left KW_LIKE, KW_REGEXP; -precedence left KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, KW_MATCH, KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, KW_MATCH_ELEMENT_LE, KW_MATCH_ELEMENT_GE; +precedence left KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, KW_MATCH_PHRASE_PREFIX, KW_MATCH, KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, KW_MATCH_ELEMENT_LE, KW_MATCH_ELEMENT_GE; precedence left EQUAL, LESSTHAN, GREATERTHAN; precedence left ADD, SUBTRACT; precedence left AT, STAR, DIVIDE, MOD, KW_DIV; @@ -7022,6 +7023,8 @@ match_predicate ::= {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ALL, e1, e2); :} | expr:e1 KW_MATCH_PHRASE expr:e2 {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_PHRASE, e1, e2); :} + | expr:e1 KW_MATCH_PHRASE_PREFIX expr:e2 + {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_PHRASE_PREFIX, e1, e2); :} | expr:e1 KW_MATCH_ELEMENT_EQ expr:e2 {: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ELEMENT_EQ, e1, e2); :} | expr:e1 KW_MATCH_ELEMENT_LT expr:e2 diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java index 10579614524e1d..49a0796c19b878 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java @@ -50,6 +50,7 @@ public enum Operator { 
MATCH_ANY("MATCH_ANY", "match_any", TExprOpcode.MATCH_ANY), MATCH_ALL("MATCH_ALL", "match_all", TExprOpcode.MATCH_ALL), MATCH_PHRASE("MATCH_PHRASE", "match_phrase", TExprOpcode.MATCH_PHRASE), + MATCH_PHRASE_PREFIX("MATCH_PHRASE_PREFIX", "match_phrase_prefix", TExprOpcode.MATCH_PHRASE_PREFIX), MATCH_ELEMENT_EQ("MATCH_ELEMENT_EQ", "match_element_eq", TExprOpcode.MATCH_ELEMENT_EQ), MATCH_ELEMENT_LT("MATCH_ELEMENT_LT", "match_element_lt", TExprOpcode.MATCH_ELEMENT_LT), MATCH_ELEMENT_GT("MATCH_ELEMENT_GT", "match_element_gt", TExprOpcode.MATCH_ELEMENT_GT), @@ -147,6 +148,16 @@ public static void initBuiltins(FunctionSet functionSet) { symbolNotUsed, Lists.newArrayList(new ArrayType(t), t), Type.BOOLEAN)); + functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator( + Operator.MATCH_PHRASE_PREFIX.getName(), + symbolNotUsed, + Lists.newArrayList(t, t), + Type.BOOLEAN)); + functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator( + Operator.MATCH_PHRASE_PREFIX.getName(), + symbolNotUsed, + Lists.newArrayList(new ArrayType(t), t), + Type.BOOLEAN)); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index c3eb5c5a18bf6c..37424f929618a5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -224,6 +224,7 @@ import org.apache.doris.nereids.trees.expressions.MatchAll; import org.apache.doris.nereids.trees.expressions.MatchAny; import org.apache.doris.nereids.trees.expressions.MatchPhrase; +import org.apache.doris.nereids.trees.expressions.MatchPhrasePrefix; import org.apache.doris.nereids.trees.expressions.Mod; import org.apache.doris.nereids.trees.expressions.Multiply; import org.apache.doris.nereids.trees.expressions.NamedExpression; @@ -2782,6 +2783,12 @@ private 
Expression withPredicate(Expression valueExpression, PredicateContext ct getExpression(ctx.pattern) ); break; + case DorisParser.MATCH_PHRASE_PREFIX: + outExpression = new MatchPhrasePrefix( + valueExpression, + getExpression(ctx.pattern) + ); + break; default: throw new ParseException("Unsupported predicate type: " + ctx.kind.getText(), ctx); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java index 5ff384a8a74f1b..cafe2824fa7499 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Match.java @@ -49,6 +49,8 @@ public Operator op() throws AnalysisException { return Operator.MATCH_ALL; case "MATCH_PHRASE": return Operator.MATCH_PHRASE; + case "MATCH_PHRASE_PREFIX": + return Operator.MATCH_PHRASE_PREFIX; default: throw new AnalysisException("UnSupported type for match: " + symbol); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchPhrasePrefix.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchPhrasePrefix.java new file mode 100644 index 00000000000000..748da21ce30c68 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/MatchPhrasePrefix.java @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions; + +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; +
+/**
+ * Match expression for the MATCH_PHRASE_PREFIX operator,
+ * e.g. {@code a MATCH_PHRASE_PREFIX 'hello w'}.
+ * It always has exactly two children: the left operand and the pattern.
+ */
+public class MatchPhrasePrefix extends Match {
+    /** Builds the expression from its left operand and its pattern. */
+    public MatchPhrasePrefix(Expression left, Expression right) {
+        super(ImmutableList.of(left, right), "MATCH_PHRASE_PREFIX");
+    }
+
+    // Used by withChildren(), which has already checked the child count.
+    private MatchPhrasePrefix(List children) {
+        super(children, "MATCH_PHRASE_PREFIX");
+    }
+
+    /** Returns a new MatchPhrasePrefix over {@code children}; exactly two are required. */
+    @Override
+    public MatchPhrasePrefix withChildren(List children) {
+        Preconditions.checkArgument(children.size() == 2);
+        return new MatchPhrasePrefix(children);
+    }
+
+    /** Dispatches to {@code visitor.visitMatchPhrasePrefix}. */
+    @Override
+    public R accept(ExpressionVisitor visitor, C context) {
+        return visitor.visitMatchPhrasePrefix(this, context);
+    }
+}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java index e9298c4893d765..eef9b56f2c73e5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java @@ -58,6 +58,7 @@ import org.apache.doris.nereids.trees.expressions.MatchAll; import org.apache.doris.nereids.trees.expressions.MatchAny; import org.apache.doris.nereids.trees.expressions.MatchPhrase;
+import org.apache.doris.nereids.trees.expressions.MatchPhrasePrefix; import org.apache.doris.nereids.trees.expressions.Mod; import org.apache.doris.nereids.trees.expressions.Multiply; import org.apache.doris.nereids.trees.expressions.NamedExpression; @@ -489,6 +490,10 @@ public R visitMatchPhrase(MatchPhrase matchPhrase, C context) { return visitMatch(matchPhrase, context); }
+    /** Visits a MatchPhrasePrefix expression; this implementation delegates to {@code visitMatch}. */
+    public R visitMatchPhrasePrefix(MatchPhrasePrefix matchPhrasePrefix, C context) {
+        return visitMatch(matchPhrasePrefix, context);
+    }
+
 /* ******************************************************************************************** * Unbound expressions * ********************************************************************************************/ diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index ff957244c921cf..2d0dac488679ac 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -423,6 +423,7 @@ public class SessionVariable implements Serializable, Writable { public static final String ENABLE_UNIQUE_KEY_PARTIAL_UPDATE = "enable_unique_key_partial_update"; public static final String INVERTED_INDEX_CONJUNCTION_OPT_THRESHOLD = "inverted_index_conjunction_opt_threshold"; + public static final String INVERTED_INDEX_MAX_EXPANSIONS = "inverted_index_max_expansions"; public static final String AUTO_ANALYZE_START_TIME = "auto_analyze_start_time"; @@ -1316,6 +1317,12 @@ public void setEnableLeftZigZag(boolean enableLeftZigZag) { + " use a skiplist to optimize the intersection."}) public int invertedIndexConjunctionOptThreshold = 1000; + @VariableMgr.VarAttr(name = INVERTED_INDEX_MAX_EXPANSIONS, + description = {"这个参数用来限制查询时扩展的词项(terms)的数量,以此来控制查询的性能", + "This parameter is used to limit the number of term expansions during a query," + + " thereby controlling query performance"}) + public int
invertedIndexMaxExpansions = 50; + @VariableMgr.VarAttr(name = SQL_DIALECT, needForward = true, checker = "checkSqlDialect", description = {"解析sql使用的方言", "The dialect used to parse sql."}) public String sqlDialect = "doris"; @@ -2635,6 +2642,7 @@ public TQueryOptions toThrift() { tResult.setEnableMemtableOnSinkNode(enableMemtableOnSinkNode); tResult.setInvertedIndexConjunctionOptThreshold(invertedIndexConjunctionOptThreshold); + tResult.setInvertedIndexMaxExpansions(invertedIndexMaxExpansions); tResult.setFasterFloatConvert(fasterFloatConvert); diff --git a/fe/fe-core/src/main/jflex/sql_scanner.flex b/fe/fe-core/src/main/jflex/sql_scanner.flex index 7ad9845701c813..d05d4afd266076 100644 --- a/fe/fe-core/src/main/jflex/sql_scanner.flex +++ b/fe/fe-core/src/main/jflex/sql_scanner.flex @@ -319,6 +319,7 @@ import org.apache.doris.qe.SqlModeHelper; keywordMap.put("match_any", new Integer(SqlParserSymbols.KW_MATCH_ANY)); keywordMap.put("match_all", new Integer(SqlParserSymbols.KW_MATCH_ALL)); keywordMap.put("match_phrase", new Integer(SqlParserSymbols.KW_MATCH_PHRASE)); + keywordMap.put("match_phrase_prefix", new Integer(SqlParserSymbols.KW_MATCH_PHRASE_PREFIX)); keywordMap.put("element_eq", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_EQ)); keywordMap.put("element_lt", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_LT)); keywordMap.put("element_gt", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_GT)); diff --git a/gensrc/thrift/Opcodes.thrift b/gensrc/thrift/Opcodes.thrift index f6444ebe218fd3..0afa53566d9bf2 100644 --- a/gensrc/thrift/Opcodes.thrift +++ b/gensrc/thrift/Opcodes.thrift @@ -93,4 +93,5 @@ enum TExprOpcode { MATCH_ELEMENT_GT, MATCH_ELEMENT_LE, MATCH_ELEMENT_GE, + MATCH_PHRASE_PREFIX, } diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index 401eb548a012b5..1c444cec1b48d7 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -259,6 +259,8 @@ struct TQueryOptions { 
91: optional bool runtime_filter_wait_infinitely = false; 92: optional i32 wait_full_block_schedule_times = 1; + + 93: optional i32 inverted_index_max_expansions = 50; } diff --git a/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix.out b/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix.out new file mode 100644 index 00000000000000..140fd5ee937992 --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix.out @@ -0,0 +1,31 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +863 + +-- !sql -- +863 + +-- !sql -- +235 + +-- !sql -- +235 + +-- !sql -- +166 + +-- !sql -- +166 + +-- !sql -- +56 + +-- !sql -- +56 + +-- !sql -- +7 + +-- !sql -- +7 + diff --git a/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix.groovy b/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix.groovy new file mode 100644 index 00000000000000..b23bc1b5a8b82a --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix.groovy @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+// Runs MATCH_PHRASE_PREFIX queries on an inverted-indexed text column, each paired with a LIKE query on the same text.
+suite("test_index_match_phrase_prefix", "p0"){
+    def indexTbName1 = "test_index_match_phrase_prefix"
+
+    sql "DROP TABLE IF EXISTS ${indexTbName1}"
+
+    sql """
+      CREATE TABLE ${indexTbName1} (
+      `@timestamp` int(11) NULL COMMENT "",
+      `clientip` varchar(20) NULL COMMENT "",
+      `request` text NULL COMMENT "",
+      `status` int(11) NULL COMMENT "",
+      `size` int(11) NULL COMMENT "",
+      INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`@timestamp`)
+      COMMENT "OLAP"
+      DISTRIBUTED BY RANDOM BUCKETS 1
+      PROPERTIES (
+      "replication_allocation" = "tag.location.default: 1"
+      );
+    """
+
+    def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false,
+                        expected_succ_rows = -1, load_to_single_tablet = 'true' ->
+
+        // Stream-load file_name into table_name, then validate the JSON load result in the check callback.
+        streamLoad {
+            table "${table_name}"
+
+            // set http request header params
+            set 'label', label + "_" + UUID.randomUUID().toString()
+            set 'read_json_by_line', read_flag
+            set 'format', format_flag
+            file file_name // import json file
+            time 10000 // limit inflight 10s
+            if (expected_succ_rows >= 0) {
+                set 'max_filter_ratio', '1'
+            }
+
+            // Declaring a check callback replaces the default result check,
+            // so every condition has to be verified here.
+            check { result, exception, startTime, endTime ->
+                if (ignore_failure && expected_succ_rows < 0) { return }
+                if (exception != null) {
+                    throw exception
+                }
+                log.info("Stream load result: ${result}".toString())
+                def json = parseJson(result)
+                assertEquals("success", json.Status.toLowerCase())
+                if (expected_succ_rows >= 0) {
+                    assertEquals(json.NumberLoadedRows, expected_succ_rows)
+                } else {
+                    assertEquals(json.NumberTotalRows, json.NumberLoadedRows + json.NumberUnselectedRows)
+                    assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0)
+                }
+            }
+        }
+    }
+
+    try {
+        load_httplogs_data.call(indexTbName1, 'test_index_match_phrase_prefix', 'true', 'json', 'documents-1000.json')
+
+        qt_sql """ select count() from ${indexTbName1} where request match_phrase_prefix 'ima'; """
+        qt_sql """ select count() from ${indexTbName1} where request like '%ima%'; """
+
+        qt_sql """ select count() from ${indexTbName1} where request match_phrase_prefix 'images/h'; """
+        qt_sql """ select count() from ${indexTbName1} where request like '%images/h%'; """
+
+        qt_sql """ select count() from ${indexTbName1} where request match_phrase_prefix 'images/hm'; """
+        qt_sql """ select count() from ${indexTbName1} where request like '%images/hm%'; """
+
+        qt_sql """ select count() from ${indexTbName1} where request match_phrase_prefix '/french/images/n'; """
+        qt_sql """ select count() from ${indexTbName1} where request like '%/french/images/n%'; """
+
+        qt_sql """ select count() from ${indexTbName1} where request match_phrase_prefix '/french/tickets/images/ti'; """
+        qt_sql """ select count() from ${indexTbName1} where request like '%/french/tickets/images/ti%'; """
+    } finally {
+        //try_sql("DROP TABLE IF EXISTS ${indexTbName1}")
+    }
+}
\ No newline at end of file