diff --git a/include/paimon/global_config.h b/include/paimon/global_config.h index 84b362ec..03c58fc0 100644 --- a/include/paimon/global_config.h +++ b/include/paimon/global_config.h @@ -29,7 +29,7 @@ namespace paimon { /// not necessarily the exact number of threads at a given point in time. /// /// You can change this number using SetArrowCpuThreadPoolCapacity(). -PAIMON_EXPORT int GetArrowCpuThreadPoolCapacity(); +PAIMON_EXPORT int32_t GetArrowCpuThreadPoolCapacity(); /// Set the capacity of the arrow's global thread pool /// This is a simple wrapper of arrow::SetCpuThreadPoolCapacity() @@ -40,6 +40,6 @@ PAIMON_EXPORT int GetArrowCpuThreadPoolCapacity(); /// The current number is returned by GetArrowCpuThreadPoolCapacity(). /// Currently, this capacity will significantly affect the performance /// of parquet file batch read. -PAIMON_EXPORT Status SetArrowCpuThreadPoolCapacity(int threads); +PAIMON_EXPORT Status SetArrowCpuThreadPoolCapacity(int32_t threads); } // namespace paimon diff --git a/include/paimon/predicate/full_text_search.h b/include/paimon/predicate/full_text_search.h index e5fba975..ad906d03 100644 --- a/include/paimon/predicate/full_text_search.h +++ b/include/paimon/predicate/full_text_search.h @@ -23,8 +23,8 @@ #include #include "paimon/predicate/predicate.h" +#include "paimon/utils/roaring_bitmap64.h" #include "paimon/visibility.h" - namespace paimon { /// A configuration structure for full-text search operations. struct PAIMON_EXPORT FullTextSearch { @@ -44,14 +44,20 @@ struct PAIMON_EXPORT FullTextSearch { UNKNOWN = 128 }; - FullTextSearch(const std::string& _field_name, int32_t _limit, const std::string& _query, - const SearchType& _search_type) - : field_name(_field_name), limit(_limit), query(_query), search_type(_search_type) {} + FullTextSearch(const std::string& _field_name, std::optional _limit, + const std::string& _query, const SearchType& _search_type, + const std::optional& _pre_filter) + : field_name(_field_name), + limit(_limit), + query(_query), + search_type(_search_type), + pre_filter(_pre_filter) {} /// Name of the field to search within (must be a full-text indexed field). std::string field_name; - /// Maximum number of documents to return. Ordered by scores. - int32_t limit; + /// Maximum number of documents to return. If set, limit ordered by top scores. Otherwise, no + /// score return. + std::optional limit; /// The query string to search for. The interpretation depends on search_type: /// /// - For MATCH_ALL/MATCH_ANY: keywords are split into terms using the **same analyzer as @@ -70,5 +76,9 @@ struct PAIMON_EXPORT FullTextSearch { std::string query; /// Type of search to perform. SearchType search_type; + /// A pre-filter based on **local row IDs**, implemented by leveraging another global index. + /// Only rows whose local row ID is present in `pre_filter` will be included during search. + /// If not set, all rows will be included. + std::optional pre_filter; }; } // namespace paimon diff --git a/include/paimon/utils/roaring_bitmap64.h b/include/paimon/utils/roaring_bitmap64.h index 7b1eacfb..f170d66e 100644 --- a/include/paimon/utils/roaring_bitmap64.h +++ b/include/paimon/utils/roaring_bitmap64.h @@ -62,6 +62,8 @@ class PAIMON_EXPORT RoaringBitmap64 { Iterator& operator++(); bool operator==(const Iterator& other) const; bool operator!=(const Iterator& other) const; + /// Move the iterator to the value which is equal or larger than input value + void EqualOrLarger(int64_t value); private: void* iterator_ = nullptr; diff --git a/src/paimon/common/utils/roaring_bitmap64.cpp b/src/paimon/common/utils/roaring_bitmap64.cpp index 98b943d7..ae078e76 100644 --- a/src/paimon/common/utils/roaring_bitmap64.cpp +++ b/src/paimon/common/utils/roaring_bitmap64.cpp @@ -97,6 +97,10 @@ bool RoaringBitmap64::Iterator::operator!=(const Iterator& other) const { return !(*this == other); } +void RoaringBitmap64::Iterator::EqualOrLarger(int64_t value) { + [[maybe_unused]] bool _ = GetIterator(iterator_).move(value); +} + RoaringBitmap64::RoaringBitmap64() { roaring_bitmap_ = new roaring::Roaring64Map(); } diff --git a/src/paimon/common/utils/roaring_bitmap64_test.cpp b/src/paimon/common/utils/roaring_bitmap64_test.cpp index 8e6b2084..71e15c8a 100644 --- a/src/paimon/common/utils/roaring_bitmap64_test.cpp +++ b/src/paimon/common/utils/roaring_bitmap64_test.cpp @@ -403,4 +403,15 @@ TEST(RoaringBitmap64Test, TestFromRoaringBitmap32) { } } +TEST(RoaringBitmap64Test, TestIteratorEqualOrLarger) { + RoaringBitmap64 roaring = RoaringBitmap64::From({1l, 3l, 5l, 100l}); + auto iter = roaring.Begin(); + ASSERT_EQ(*iter, 1l); + iter.EqualOrLarger(5l); + ASSERT_EQ(*iter, 5l); + iter.EqualOrLarger(10l); + ASSERT_EQ(*iter, 100l); + iter.EqualOrLarger(200l); + ASSERT_EQ(iter, roaring.End()); +} } // namespace paimon::test diff --git a/src/paimon/global_index/lucene/CMakeLists.txt b/src/paimon/global_index/lucene/CMakeLists.txt index b049ca88..cb556ae9 100644 --- a/src/paimon/global_index/lucene/CMakeLists.txt +++ b/src/paimon/global_index/lucene/CMakeLists.txt @@ -41,6 +41,7 @@ if(PAIMON_ENABLE_LUCENE) lucene_api_test.cpp lucene_directory_test.cpp lucene_global_index_test.cpp + lucene_filter_test.cpp EXTRA_INCLUDES ${LUCENE_INCLUDE_DIR} STATIC_LINK_LIBS diff --git a/src/paimon/global_index/lucene/lucene_api_test.cpp b/src/paimon/global_index/lucene/lucene_api_test.cpp index 815ca510..d6868f0f 100644 --- a/src/paimon/global_index/lucene/lucene_api_test.cpp +++ b/src/paimon/global_index/lucene/lucene_api_test.cpp @@ -22,7 +22,67 @@ #include "paimon/testing/utils/testharness.h" namespace paimon::lucene::test { -TEST(LuceneInterfaceTest, TestSimple) { +class LuceneInterfaceTest : public ::testing::Test { + public: + void SetUp() override {} + void TearDown() override {} + + class TestDocIdSetIterator : public Lucene::DocIdSetIterator { + public: + explicit TestDocIdSetIterator(const std::vector& ids) + : Lucene::DocIdSetIterator(), ids_(ids) {} + + int32_t advance(int32_t target) override { + int32_t doc_id = nextDoc(); + while (doc_id < target) { + doc_id = nextDoc(); + } + return doc_id; + } + int32_t docID() override { + return ids_[cursor_]; + } + int32_t nextDoc() override { + if (cursor_ == ids_.size()) { + return Lucene::DocIdSetIterator::NO_MORE_DOCS; + } + return ids_[cursor_++]; + } + + private: + size_t cursor_ = 0; + std::vector ids_; + }; + + class TestDocIdSet : public Lucene::DocIdSet { + public: + explicit TestDocIdSet(const std::vector& ids) : DocIdSet(), ids_(ids) {} + + Lucene::DocIdSetIteratorPtr iterator() override { + return Lucene::newLucene(ids_); + } + bool isCacheable() override { + return true; + } + + private: + std::vector ids_; + }; + + class TestFilter : public Lucene::Filter { + public: + explicit TestFilter(const std::vector& ids) : ids_(ids) {} + + Lucene::DocIdSetPtr getDocIdSet(const Lucene::IndexReaderPtr& reader) override { + return Lucene::newLucene(ids_); + } + + private: + std::vector ids_; + }; +}; + +TEST_F(LuceneInterfaceTest, TestSimple) { auto dir = paimon::test::UniqueTestDirectory::Create("local"); std::string index_path = dir->Str() + "/lucene_test"; auto lucene_dir = Lucene::FSDirectory::open(LuceneUtils::StringToWstring(index_path), @@ -68,10 +128,17 @@ TEST(LuceneInterfaceTest, TestSimple) { parser->setAllowLeadingWildcard(true); auto search = [&](const std::wstring& query_str, int32_t limit, + const std::optional> selected_id, const std::vector& expected_doc_id_vec, const std::vector& expected_doc_id_content_vec) { Lucene::QueryPtr query = parser->parse(query_str); - Lucene::TopDocsPtr results = searcher->search(query, limit); + Lucene::TopDocsPtr results; + if (selected_id) { + Lucene::FilterPtr lucene_filter = Lucene::newLucene(selected_id.value()); + results = searcher->search(query, lucene_filter, limit); + } else { + results = searcher->search(query, limit); + } ASSERT_EQ(expected_doc_id_vec.size(), results->scoreDocs.size()); std::vector resule_doc_id_vec; @@ -86,18 +153,29 @@ TEST(LuceneInterfaceTest, TestSimple) { }; // result is sorted by tf-idf score - search(L"document", /*limit=*/10, std::vector({2, 1, 0}), + search(L"document", /*limit=*/10, /*selected_id=*/std::nullopt, std::vector({2, 1, 0}), std::vector({L"2", L"1", L"0"})); - search(L"document", /*limit=*/1, std::vector({2}), std::vector({L"2"})); - search(L"test AND document", /*limit=*/10, std::vector({2, 0}), - std::vector({L"2", L"0"})); - search(L"test OR new", /*limit=*/10, std::vector({1, 0, 2}), - std::vector({L"1", L"0", L"2"})); - search(L"\"test document\"", /*limit=*/10, std::vector({0}), - std::vector({L"0"})); - search(L"unordered", /*limit=*/10, std::vector({3}), + search(L"document", /*limit=*/1, /*selected_id=*/std::nullopt, std::vector({2}), + std::vector({L"2"})); + search(L"test AND document", /*limit=*/10, /*selected_id=*/std::nullopt, + std::vector({2, 0}), std::vector({L"2", L"0"})); + search(L"test OR new", /*limit=*/10, /*selected_id=*/std::nullopt, + std::vector({1, 0, 2}), std::vector({L"1", L"0", L"2"})); + search(L"\"test document\"", /*limit=*/10, /*selected_id=*/std::nullopt, + std::vector({0}), std::vector({L"0"})); + search(L"unordered", /*limit=*/10, /*selected_id=*/std::nullopt, std::vector({3}), std::vector({L"5"})); - search(L"*orDer*", /*limit=*/10, std::vector({3}), std::vector({L"5"})); + search(L"*orDer*", /*limit=*/10, /*selected_id=*/std::nullopt, std::vector({3}), + std::vector({L"5"})); + + // test filter + search(L"document", /*limit=*/10, /*selected_id=*/std::vector({0, 1}), + std::vector({1, 0}), std::vector({L"1", L"0"})); + search(L"document OR unordered", /*limit=*/10, + /*selected_id=*/std::vector({0, 1, 3}), std::vector({3, 1, 0}), + std::vector({L"5", L"1", L"0"})); + search(L"unordered", /*limit=*/10, /*selected_id=*/std::vector({0}), + std::vector(), std::vector()); reader->close(); lucene_dir->close(); diff --git a/src/paimon/global_index/lucene/lucene_collector.h b/src/paimon/global_index/lucene/lucene_collector.h new file mode 100644 index 00000000..9fda5a68 --- /dev/null +++ b/src/paimon/global_index/lucene/lucene_collector.h @@ -0,0 +1,44 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include "lucene++/LuceneHeaders.h" +#include "paimon/utils/roaring_bitmap64.h" + +namespace paimon::lucene { +class LuceneCollector : public Lucene::Collector { + public: + LuceneCollector() : Lucene::Collector() {} + void setScorer(const Lucene::ScorerPtr& scorer) override { + // ignore scorer + } + void collect(int32_t doc) override { + bitmap_.Add(doc_base_ + doc); + } + void setNextReader(const Lucene::IndexReaderPtr& reader, int32_t doc_base) override { + doc_base_ = doc_base; + } + bool acceptsDocsOutOfOrder() override { + return true; + } + const RoaringBitmap64& GetBitmap() const { + return bitmap_; + } + + private: + RoaringBitmap64 bitmap_; + int64_t doc_base_ = 0; +}; +} // namespace paimon::lucene diff --git a/src/paimon/global_index/lucene/lucene_filter.h b/src/paimon/global_index/lucene/lucene_filter.h new file mode 100644 index 00000000..c34f94e4 --- /dev/null +++ b/src/paimon/global_index/lucene/lucene_filter.h @@ -0,0 +1,83 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include "lucene++/LuceneHeaders.h" +#include "paimon/utils/roaring_bitmap64.h" + +namespace paimon::lucene { +class BitmapDocIdSetIterator : public Lucene::DocIdSetIterator { + public: + explicit BitmapDocIdSetIterator(const RoaringBitmap64* ids) + : Lucene::DocIdSetIterator(), ids_(ids), iter_(ids->Begin()) {} + + int32_t advance(int32_t target) override { + iter_.EqualOrLarger(static_cast(target)); + if (iter_ == ids_->End()) { + return Lucene::DocIdSetIterator::NO_MORE_DOCS; + } + return static_cast(*iter_); + } + + int32_t docID() override { + if (iter_ == ids_->End()) { + return Lucene::DocIdSetIterator::NO_MORE_DOCS; + } + return static_cast(*iter_); + } + + int32_t nextDoc() override { + if (iter_ == ids_->End()) { + return Lucene::DocIdSetIterator::NO_MORE_DOCS; + } + auto id = static_cast(*iter_); + ++iter_; + return id; + } + + private: + const RoaringBitmap64* ids_; + RoaringBitmap64::Iterator iter_; +}; + +class BitmapDocIdSet : public Lucene::DocIdSet { + public: + explicit BitmapDocIdSet(const RoaringBitmap64* ids) : DocIdSet(), ids_(ids) {} + + Lucene::DocIdSetIteratorPtr iterator() override { + return Lucene::newLucene(ids_); + } + + bool isCacheable() override { + return true; + } + + private: + const RoaringBitmap64* ids_; +}; + +class LuceneFilter : public Lucene::Filter { + public: + explicit LuceneFilter(const RoaringBitmap64* ids) : ids_(ids) {} + + Lucene::DocIdSetPtr getDocIdSet(const Lucene::IndexReaderPtr& reader) override { + return Lucene::newLucene(ids_); + } + + private: + const RoaringBitmap64* ids_; +}; + +} // namespace paimon::lucene diff --git a/src/paimon/global_index/lucene/lucene_filter_test.cpp b/src/paimon/global_index/lucene/lucene_filter_test.cpp new file mode 100644 index 00000000..8d0efb6d --- /dev/null +++ b/src/paimon/global_index/lucene/lucene_filter_test.cpp @@ -0,0 +1,46 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "paimon/global_index/lucene/lucene_filter.h" + +#include "gtest/gtest.h" +#include "lucene++/LuceneHeaders.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::lucene::test { +TEST(LuceneFilterTest, TestSimple) { + RoaringBitmap64 roaring = RoaringBitmap64::From({1l, 3l, 5l, 100l}); + LuceneFilter filter(&roaring); + + auto doc_id_set = filter.getDocIdSet(/*reader=*/Lucene::IndexReaderPtr()); + ASSERT_TRUE(doc_id_set); + + auto doc_iter = doc_id_set->iterator(); + ASSERT_TRUE(doc_iter); + ASSERT_TRUE(doc_id_set->isCacheable()); + + ASSERT_EQ(1, doc_iter->nextDoc()); + + ASSERT_EQ(5, doc_iter->advance(4)); + ASSERT_EQ(5, doc_iter->docID()); + + ASSERT_EQ(100, doc_iter->advance(100)); + + ASSERT_EQ(Lucene::DocIdSetIterator::NO_MORE_DOCS, doc_iter->advance(1000)); + ASSERT_EQ(Lucene::DocIdSetIterator::NO_MORE_DOCS, doc_iter->docID()); + ASSERT_EQ(Lucene::DocIdSetIterator::NO_MORE_DOCS, doc_iter->nextDoc()); +} + +} // namespace paimon::lucene::test diff --git a/src/paimon/global_index/lucene/lucene_global_index.cpp b/src/paimon/global_index/lucene/lucene_global_index.cpp index 1502cf2a..5e4dc5f6 100644 --- a/src/paimon/global_index/lucene/lucene_global_index.cpp +++ b/src/paimon/global_index/lucene/lucene_global_index.cpp @@ -25,8 +25,10 @@ #include "paimon/common/utils/rapidjson_util.h" #include "paimon/common/utils/uuid.h" #include "paimon/global_index/bitmap_vector_search_global_index_result.h" +#include "paimon/global_index/lucene/lucene_collector.h" #include "paimon/global_index/lucene/lucene_defs.h" #include "paimon/global_index/lucene/lucene_directory.h" +#include "paimon/global_index/lucene/lucene_filter.h" #include "paimon/global_index/lucene/lucene_utils.h" #include "paimon/io/data_input_stream.h" @@ -331,7 +333,7 @@ std::vector LuceneGlobalIndexReader::TokenizeQuery(const std::stri return wterms; } -Result> LuceneGlobalIndexReader::VisitFullTextSearch( +Result> LuceneGlobalIndexReader::VisitFullTextSearch( const std::shared_ptr& full_text_search) { try { Lucene::QueryPtr query; @@ -381,32 +383,45 @@ Result> LuceneGlobalIndexReader:: fmt::format("Not support for FullTextSearch SearchType {}", static_cast(full_text_search->search_type))); } + Lucene::FilterPtr filter = + full_text_search->pre_filter + ? Lucene::newLucene(&(full_text_search->pre_filter.value())) + : Lucene::FilterPtr(); - Lucene::TopDocsPtr results = searcher_->search(query, full_text_search->limit); + if (full_text_search->limit) { + Lucene::TopDocsPtr results = + searcher_->search(query, filter, full_text_search->limit.value()); - // prepare BitmapVectorSearchGlobalIndexResult - std::map id_to_score; - for (auto score_doc : results->scoreDocs) { - Lucene::DocumentPtr result_doc = searcher_->doc(score_doc->doc); - std::string row_id_str = - LuceneUtils::WstringToString(result_doc->get(kRowIdFieldWstring)); - std::optional row_id = StringUtils::StringToValue(row_id_str); - if (!row_id) { - return Status::Invalid(fmt::format("parse row id str {} to int failed"), - row_id_str); + // prepare BitmapVectorSearchGlobalIndexResult + std::map id_to_score; + for (auto score_doc : results->scoreDocs) { + Lucene::DocumentPtr result_doc = searcher_->doc(score_doc->doc); + std::string row_id_str = + LuceneUtils::WstringToString(result_doc->get(kRowIdFieldWstring)); + std::optional row_id = StringUtils::StringToValue(row_id_str); + if (!row_id) { + return Status::Invalid( + fmt::format("parse row id str {} to int failed", row_id_str)); + } + id_to_score[static_cast(row_id.value())] = + static_cast(score_doc->score); } - id_to_score[static_cast(row_id.value())] = - static_cast(score_doc->score); - } - RoaringBitmap64 bitmap; - std::vector scores; - scores.reserve(id_to_score.size()); - for (const auto& [id, score] : id_to_score) { - bitmap.Add(id); - scores.push_back(score); + RoaringBitmap64 bitmap; + std::vector scores; + scores.reserve(id_to_score.size()); + for (const auto& [id, score] : id_to_score) { + bitmap.Add(id); + scores.push_back(score); + } + return std::make_shared(std::move(bitmap), + std::move(scores)); + } else { + // with no limit & no score + auto collector = Lucene::newLucene(); + searcher_->search(query, filter, collector); + return std::make_shared( + [collector]() -> Result { return collector->GetBitmap(); }); } - return std::make_shared(std::move(bitmap), - std::move(scores)); } catch (const std::exception& e) { return Status::Invalid(fmt::format("visit term query failed, with {} error.", e.what())); } catch (...) { diff --git a/src/paimon/global_index/lucene/lucene_global_index.h b/src/paimon/global_index/lucene/lucene_global_index.h index 786e775b..dafa801d 100644 --- a/src/paimon/global_index/lucene/lucene_global_index.h +++ b/src/paimon/global_index/lucene/lucene_global_index.h @@ -164,7 +164,7 @@ class LuceneGlobalIndexReader : public GlobalIndexReader { "LuceneGlobalIndexReader is not supposed to handle vector search query"); } - Result> VisitFullTextSearch( + Result> VisitFullTextSearch( const std::shared_ptr& full_text_search); bool IsThreadSafe() const override { diff --git a/src/paimon/global_index/lucene/lucene_global_index_test.cpp b/src/paimon/global_index/lucene/lucene_global_index_test.cpp index bd13a9c1..f5274f7e 100644 --- a/src/paimon/global_index/lucene/lucene_global_index_test.cpp +++ b/src/paimon/global_index/lucene/lucene_global_index_test.cpp @@ -100,16 +100,21 @@ class LuceneGlobalIndexTest : public ::testing::Test, pool_); } - void CheckResult(const std::shared_ptr& result, + void CheckResult(const std::shared_ptr& result, const std::vector& expected_ids) const { - auto typed_result = std::dynamic_pointer_cast(result); - ASSERT_TRUE(typed_result); - ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, typed_result->GetBitmap()); + const RoaringBitmap64* bitmap = nullptr; + if (auto vector_search_result = + std::dynamic_pointer_cast(result)) { + ASSERT_OK_AND_ASSIGN(bitmap, vector_search_result->GetBitmap()); + ASSERT_EQ(vector_search_result->GetScores().size(), expected_ids.size()); + } else if (auto bitmap_result = + std::dynamic_pointer_cast(result)) { + ASSERT_OK_AND_ASSIGN(bitmap, bitmap_result->GetBitmap()); + } ASSERT_TRUE(bitmap); - ASSERT_EQ(*(typed_result->GetBitmap().value()), RoaringBitmap64::From(expected_ids)) - << "result=" << (typed_result->GetBitmap().value())->ToString() + ASSERT_EQ(*bitmap, RoaringBitmap64::From(expected_ids)) + << "result=" << bitmap->ToString() << ", expected=" << RoaringBitmap64::From(expected_ids).ToString(); - ASSERT_EQ(typed_result->scores_.size(), expected_ids.size()); } private: @@ -157,66 +162,128 @@ TEST_P(LuceneGlobalIndexTest, TestSimple) { ASSERT_OK_AND_ASSIGN(auto result, lucene_reader->VisitFullTextSearch(std::make_shared( "f0", - /*limit=*/10, "document", FullTextSearch::SearchType::MATCH_ALL))); + /*limit=*/10, "document", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/std::nullopt))); CheckResult(result, {2l, 1l, 0l}); } { ASSERT_OK_AND_ASSIGN(auto result, lucene_reader->VisitFullTextSearch(std::make_shared( "f0", - /*limit=*/1, "document", FullTextSearch::SearchType::MATCH_ANY))); + /*limit=*/1, "document", FullTextSearch::SearchType::MATCH_ANY, + /*pre_filter=*/std::nullopt))); CheckResult(result, {2l}); } { ASSERT_OK_AND_ASSIGN( - auto result, - lucene_reader->VisitFullTextSearch(std::make_shared( - "f0", - /*limit=*/10, "test document", FullTextSearch::SearchType::MATCH_ALL))); + auto result, lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/10, "test document", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/std::nullopt))); CheckResult(result, {2l, 0l}); } { ASSERT_OK_AND_ASSIGN(auto result, lucene_reader->VisitFullTextSearch(std::make_shared( "f0", - /*limit=*/10, "test new", FullTextSearch::SearchType::MATCH_ANY))); + /*limit=*/10, "test new", FullTextSearch::SearchType::MATCH_ANY, + /*pre_filter=*/std::nullopt))); CheckResult(result, {1l, 0l, 2l}); } { - ASSERT_OK_AND_ASSIGN( - auto result, lucene_reader->VisitFullTextSearch(std::make_shared( - "f0", - /*limit=*/10, "test document", FullTextSearch::SearchType::PHRASE))); + ASSERT_OK_AND_ASSIGN(auto result, + lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/10, "test document", FullTextSearch::SearchType::PHRASE, + /*pre_filter=*/std::nullopt))); CheckResult(result, {0l}); } { - ASSERT_OK_AND_ASSIGN( - auto result, lucene_reader->VisitFullTextSearch(std::make_shared( - "f0", - /*limit=*/10, "unordered", FullTextSearch::SearchType::MATCH_ALL))); + ASSERT_OK_AND_ASSIGN(auto result, + lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/10, "unordered", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/std::nullopt))); CheckResult(result, {3l}); } { ASSERT_OK_AND_ASSIGN(auto result, lucene_reader->VisitFullTextSearch(std::make_shared( "f0", - /*limit=*/10, "unorder", FullTextSearch::SearchType::PREFIX))); + /*limit=*/10, "unorder", FullTextSearch::SearchType::PREFIX, + /*pre_filter=*/std::nullopt))); CheckResult(result, {3l}); } + // test wildcard query { ASSERT_OK_AND_ASSIGN(auto result, lucene_reader->VisitFullTextSearch(std::make_shared( "f0", - /*limit=*/10, "*order*", FullTextSearch::SearchType::WILDCARD))); + /*limit=*/10, "*order*", FullTextSearch::SearchType::WILDCARD, + /*pre_filter=*/std::nullopt))); CheckResult(result, {3l}); } { ASSERT_OK_AND_ASSIGN(auto result, lucene_reader->VisitFullTextSearch(std::make_shared( "f0", - /*limit=*/10, "*or*er*", FullTextSearch::SearchType::WILDCARD))); + /*limit=*/10, "*or*er*", FullTextSearch::SearchType::WILDCARD, + /*pre_filter=*/std::nullopt))); CheckResult(result, {3l}); } + // test filter + { + ASSERT_OK_AND_ASSIGN(auto result, + lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/10, "document", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/RoaringBitmap64::From({0l, 1l})))); + CheckResult(result, {0l, 1l}); + } + { + ASSERT_OK_AND_ASSIGN(auto result, + lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/10, "document", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/RoaringBitmap64::From({2l, 100l})))); + CheckResult(result, {2l}); + } + { + ASSERT_OK_AND_ASSIGN(auto result, + lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/10, "document", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/RoaringBitmap64::From({20l, 100l})))); + CheckResult(result, {}); + } + // test no limit + { + ASSERT_OK_AND_ASSIGN( + auto result, + lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/std::nullopt, "document", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/std::nullopt))); + CheckResult(result, {0l, 1l, 2l}); + } + { + ASSERT_OK_AND_ASSIGN( + auto result, + lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/std::nullopt, "document", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/RoaringBitmap64::From({2l})))); + CheckResult(result, {2l}); + } + { + ASSERT_OK_AND_ASSIGN( + auto result, + lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/std::nullopt, "document test", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/RoaringBitmap64::From({1l, 2l, 3l, 100l})))); + CheckResult(result, {2l}); + } } INSTANTIATE_TEST_SUITE_P(ReadBufferSize, LuceneGlobalIndexTest,