Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
317 changes: 308 additions & 9 deletions be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,286 @@

namespace doris::segment_v2 {

Status PhraseQuery::parser_slop(std::string& query, int32_t& slop) {
// Repositions every posting on `doc` and then delegates to the concrete
// matcher (the CRTP `Derived` type) to look for a match in that document.
// Returns whatever Derived::next_match() reports.
template <typename Derived>
bool PhraseMatcherBase<Derived>::matches(int32_t doc) {
    reset(doc);
    auto* self = static_cast<Derived*>(this);
    return self->next_match();
}

// Prepares all postings for matching inside document `doc`:
//  - any posting not currently on `doc` is advanced to it,
//  - `_freq` is refreshed from the posting's freq(),
//  - the position cursor is rewound (`_pos = -1`, `_upTo = 0`).
template <typename Derived>
void PhraseMatcherBase<Derived>::reset(int32_t doc) {
    for (PostingsAndPosition& entry : _postings) {
        auto& cursor = entry._postings;
        if (cursor.docID() != doc) {
            cursor.advance(doc);
        }
        entry._freq = cursor.freq();
        entry._pos = -1;
        entry._upTo = 0;
    }
}

// Moves `posting` forward until its current position is >= `target`.
// Returns false when all `_freq` positions have been consumed before the
// target was reached; returns true otherwise (including when the posting is
// already at or beyond `target`, in which case nothing is read).
template <typename Derived>
bool PhraseMatcherBase<Derived>::advance_position(PostingsAndPosition& posting, int32_t target) {
    while (posting._pos < target) {
        if (posting._upTo == posting._freq) {
            // No positions left in this document.
            return false;
        }
        posting._pos = posting._postings.nextPosition();
        ++posting._upTo;
    }
    return true;
}

bool ExactPhraseMatcher::next_match() {
PostingsAndPosition& lead = _postings[0];
if (lead._upTo < lead._freq) {
lead._pos = lead._postings.nextPosition();
lead._upTo += 1;
} else {
return false;
}

while (true) {
int32_t phrasePos = lead._pos - lead._offset;

bool advance_head = false;
for (size_t j = 1; j < _postings.size(); ++j) {
PostingsAndPosition& posting = _postings[j];
int32_t expectedPos = phrasePos + posting._offset;
// advance up to the same position as the lead
if (!advance_position(posting, expectedPos)) {
return false;
}

if (posting._pos != expectedPos) { // we advanced too far
if (advance_position(lead, posting._pos - posting._offset + lead._offset)) {
advance_head = true;
break;
} else {
return false;
}
}
}
if (advance_head) {
continue;
}

return true;
}

return false;
}

bool OrderedSloppyPhraseMatcher::next_match() {
PostingsAndPosition* prev_posting = _postings.data();
while (prev_posting->_upTo < prev_posting->_freq) {
prev_posting->_pos = prev_posting->_postings.nextPosition();
prev_posting->_upTo += 1;
if (stretchToOrder(prev_posting) && _match_width <= _allowed_slop) {
return true;
}
}
return false;
}

// Starting from `prev_posting`, advances each subsequent posting to a position
// strictly after its predecessor's. `_match_width` is reset and then
// accumulates, for every pair of neighbours, how far the later posting landed
// beyond the position immediately following the earlier one.
// Returns false if any posting runs out of positions.
bool OrderedSloppyPhraseMatcher::stretchToOrder(PostingsAndPosition* prev_posting) {
    _match_width = 0;
    const size_t count = _postings.size();
    for (size_t idx = 1; idx < count; ++idx) {
        PostingsAndPosition& current = _postings[idx];
        if (!advance_position(current, prev_posting->_pos + 1)) {
            return false;
        }
        // Extra distance beyond the adjacent slot.
        _match_width += (current._pos - (prev_posting->_pos + 1));
        prev_posting = &current;
    }
    return true;
}

PhraseQuery::PhraseQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher,
const TQueryOptions& query_options)
: _searcher(searcher) {}

// Releases the postings objects and Term objects created by add().
// Postings are released before the terms, and null entries are skipped.
PhraseQuery::~PhraseQuery() {
    for (auto& postings : _term_docs) {
        if (postings == nullptr) {
            continue;
        }
        _CLDELETE(postings);
    }
    for (auto& owned_term : _terms) {
        if (owned_term == nullptr) {
            continue;
        }
        _CLDELETE(owned_term);
    }
}

// Registers the phrase `terms` on `field_name`.
//  - Exact phrases (slop == 0) and ordered sloppy phrases are handled by the
//    three-argument overload.
//  - Unordered sloppy phrases are delegated to a CLucene PhraseQuery with the
//    given slop, which is stored in `_matcher`.
// Throws CL_ERR_IllegalArgument when `terms` is empty.
void PhraseQuery::add(const std::wstring& field_name, const std::vector<std::string>& terms,
                      int32_t slop, bool ordered) {
    if (terms.empty()) {
        _CLTHROWA(CL_ERR_IllegalArgument, "PhraseQuery::add: terms empty");
    }

    const bool use_position_matcher = (slop == 0) || ordered;
    if (use_position_matcher) {
        add(field_name, terms, slop);
        return;
    }

    auto clucene_query = std::make_unique<CL_NS(search)::PhraseQuery>();
    for (const std::string& text : terms) {
        const std::wstring wide_text = StringUtil::string_to_wstring(text);
        auto* term = _CLNEW lucene::index::Term(field_name.c_str(), wide_text.c_str());
        clucene_query->add(term);
        // Give up the local reference to the term.
        // NOTE(review): presumably the query keeps its own reference - confirm.
        _CLDECDELETE(term);
    }
    clucene_query->setSlop(slop);
    _matcher = std::move(clucene_query);
}

void PhraseQuery::add(const std::wstring& field_name, const std::vector<std::string>& terms,
int32_t slop) {
if (terms.empty()) {
_CLTHROWA(CL_ERR_IllegalArgument, "PhraseQuery::add: terms empty");
}

if (terms.size() == 1) {
std::wstring ws_term = StringUtil::string_to_wstring(terms[0]);
Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str());
_terms.push_back(t);
TermDocs* term_doc = _searcher->getReader()->termDocs(t);
_term_docs.push_back(term_doc);
_lead1 = TermIterator(term_doc);
return;
}

std::vector<TermIterator> iterators;
auto ensureTermPosition = [this, &iterators, &field_name](const std::string& term) {
std::wstring ws_term = StringUtil::string_to_wstring(term);
Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str());
_terms.push_back(t);
TermPositions* term_pos = _searcher->getReader()->termPositions(t);
_term_docs.push_back(term_pos);
iterators.emplace_back(term_pos);
return term_pos;
};

if (slop == 0) {
ExactPhraseMatcher matcher;
for (size_t i = 0; i < terms.size(); i++) {
const auto& term = terms[i];
auto* term_pos = ensureTermPosition(term);
matcher._postings.emplace_back(term_pos, i);
}
_matcher = matcher;
} else {
OrderedSloppyPhraseMatcher matcher;
for (size_t i = 0; i < terms.size(); i++) {
const auto& term = terms[i];
auto* term_pos = ensureTermPosition(term);
matcher._postings.emplace_back(term_pos, i);
}
matcher._allowed_slop = slop;
_matcher = matcher;
}

std::sort(iterators.begin(), iterators.end(), [](const TermIterator& a, const TermIterator& b) {
return a.docFreq() < b.docFreq();
});

_lead1 = iterators[0];
_lead2 = iterators[1];
for (int32_t i = 2; i < iterators.size(); i++) {
_others.push_back(iterators[i]);
}
}

// Executes the query and adds every matching doc id to `roaring`.
//  - If a CLucene PhraseQuery was installed, it is run through the searcher.
//  - Otherwise: nothing is added when `_lead1` is empty; the bitmap path is
//    used when `_lead2` is empty; the conjunction + position check path is
//    used in all other cases.
void PhraseQuery::search(roaring::Roaring& roaring) {
    if (const auto* clucene_query = std::get_if<PhraseQueryPtr>(&_matcher)) {
        _searcher->_search(
                clucene_query->get(),
                [&roaring](const int32_t docid, const float_t /*score*/) { roaring.add(docid); });
        return;
    }

    if (_lead1.isEmpty()) {
        return;
    }
    if (_lead2.isEmpty()) {
        search_by_bitmap(roaring);
        return;
    }
    search_by_skiplist(roaring);
}

// Drains `_lead1` range by range and copies every doc id into `roaring`.
// A range is either an explicit list of ids (kMany) or a [first, second)
// style pair handed to Roaring::addRange.
void PhraseQuery::search_by_bitmap(roaring::Roaring& roaring) {
    DocRange chunk;
    while (_lead1.readRange(&chunk)) {
        if (chunk.type_ != DocRangeType::kMany) {
            roaring.addRange(chunk.doc_range.first, chunk.doc_range.second);
            continue;
        }
        roaring.addMany(chunk.doc_many_size_, chunk.doc_many->data());
    }
}

// Walks the documents on which all term iterators agree (via do_next) and
// records those for which the position matcher reports a phrase match.
// Iteration stops when do_next yields INT32_MAX.
void PhraseQuery::search_by_skiplist(roaring::Roaring& roaring) {
    for (int32_t candidate = do_next(_lead1.nextDoc()); candidate != INT32_MAX;
         candidate = do_next(_lead1.nextDoc())) {
        if (matches(candidate)) {
            roaring.add(candidate);
        }
    }
}

int32_t PhraseQuery::do_next(int32_t doc) {
while (true) {
assert(doc == _lead1.docID());

// the skip list is used to find the two smallest inverted lists
int32_t next2 = _lead2.advance(doc);
if (next2 != doc) {
doc = _lead1.advance(next2);
if (next2 != doc) {
continue;
}
}

// if both lead1 and lead2 exist, use skip list to lookup other inverted indexes
bool advance_head = false;
for (auto& other : _others) {
if (other.isEmpty()) {
continue;
}

if (other.docID() < doc) {
int32_t next = other.advance(doc);
if (next > doc) {
doc = _lead1.advance(next);
advance_head = true;
break;
}
}
}
if (advance_head) {
continue;
}

return doc;
}
}

// Asks the installed position matcher whether `doc` contains the phrase.
// Throws CL_ERR_IllegalArgument if `_matcher` holds a CLucene PhraseQuery,
// which has no per-document matches() entry point.
bool PhraseQuery::matches(int32_t doc) {
    auto dispatch = [&doc](auto&& matcher) -> bool {
        using Matcher = std::decay_t<decltype(matcher)>;
        if constexpr (!std::is_same_v<Matcher, PhraseQueryPtr>) {
            return matcher.matches(doc);
        } else {
            _CLTHROWA(CL_ERR_IllegalArgument,
                      "PhraseQueryPtr does not support matches function");
        }
    };
    return std::visit(dispatch, _matcher);
}

void PhraseQuery::parser_slop(std::string& query, int32_t& slop, bool& ordered_phrase) {
auto is_digits = [](const std::string_view& str) {
return std::all_of(str.begin(), str.end(), [](unsigned char c) { return std::isdigit(c); });
};
Expand All @@ -32,17 +311,37 @@ Status PhraseQuery::parser_slop(std::string& query, int32_t& slop) {
if (tilde_pos < query.size() - 1 && query[tilde_pos] == '~') {
size_t slop_pos = tilde_pos + 1;
std::string_view slop_str(query.data() + slop_pos, query.size() - slop_pos);
if (is_digits(slop_str)) {
auto result = std::from_chars(slop_str.begin(), slop_str.end(), slop);
if (result.ec != std::errc()) {
return Status::Error<doris::ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
"PhraseQuery parser failed: {}", query);
do {
if (slop_str.empty()) {
break;
}
query = query.substr(0, last_space_pos);
}

bool ordered = false;
if (slop_str.size() == 1) {
if (!std::isdigit(slop_str[0])) {
break;
}
} else {
if (slop_str.back() == '+') {
ordered = true;
slop_str.remove_suffix(1);
}
}

if (is_digits(slop_str)) {
auto result = std::from_chars(slop_str.begin(), slop_str.end(), slop);
if (result.ec != std::errc()) {
break;
}
ordered_phrase = ordered;
query = query.substr(0, last_space_pos);
}
} while (false);
}
}
return Status::OK();
}

template class PhraseMatcherBase<ExactPhraseMatcher>;
template class PhraseMatcherBase<OrderedSloppyPhraseMatcher>;

} // namespace doris::segment_v2
Loading