From cd2b87a2ec23e744f0f8f9ca1e25c51d8e4a7e35 Mon Sep 17 00:00:00 2001 From: lxy264173 Date: Thu, 29 Jan 2026 18:17:43 +0800 Subject: [PATCH 1/4] feat: support filter & limit in full text search --- include/paimon/global_config.h | 4 +- include/paimon/predicate/full_text_search.h | 20 ++- include/paimon/utils/roaring_bitmap64.h | 2 + src/paimon/common/utils/roaring_bitmap64.cpp | 4 + .../common/utils/roaring_bitmap64_test.cpp | 11 ++ src/paimon/global_index/lucene/CMakeLists.txt | 1 + .../global_index/lucene/lucene_api_test.cpp | 104 ++++++++++++++-- .../global_index/lucene/lucene_collector.h | 44 +++++++ .../global_index/lucene/lucene_filter.h | 83 +++++++++++++ .../lucene/lucene_filter_test.cpp | 46 +++++++ .../lucene/lucene_global_index.cpp | 61 +++++---- .../global_index/lucene/lucene_global_index.h | 2 +- .../lucene/lucene_global_index_test.cpp | 117 ++++++++++++++---- 13 files changed, 430 insertions(+), 69 deletions(-) create mode 100644 src/paimon/global_index/lucene/lucene_collector.h create mode 100644 src/paimon/global_index/lucene/lucene_filter.h create mode 100644 src/paimon/global_index/lucene/lucene_filter_test.cpp diff --git a/include/paimon/global_config.h b/include/paimon/global_config.h index 84b362ec..03c58fc0 100644 --- a/include/paimon/global_config.h +++ b/include/paimon/global_config.h @@ -29,7 +29,7 @@ namespace paimon { /// not necessarily the exact number of threads at a given point in time. /// /// You can change this number using SetArrowCpuThreadPoolCapacity(). -PAIMON_EXPORT int GetArrowCpuThreadPoolCapacity(); +PAIMON_EXPORT int32_t GetArrowCpuThreadPoolCapacity(); /// Set the capacity of the arrow's global thread pool /// This is a simple wrapper of arrow::SetCpuThreadPoolCapacity() @@ -40,6 +40,6 @@ PAIMON_EXPORT int GetArrowCpuThreadPoolCapacity(); /// The current number is returned by GetArrowCpuThreadPoolCapacity(). /// Currently, this capacity will significantly affect the performance /// of parquet file batch read. -PAIMON_EXPORT Status SetArrowCpuThreadPoolCapacity(int threads); +PAIMON_EXPORT Status SetArrowCpuThreadPoolCapacity(int32_t threads); } // namespace paimon diff --git a/include/paimon/predicate/full_text_search.h b/include/paimon/predicate/full_text_search.h index e5fba975..377125ba 100644 --- a/include/paimon/predicate/full_text_search.h +++ b/include/paimon/predicate/full_text_search.h @@ -44,14 +44,20 @@ struct PAIMON_EXPORT FullTextSearch { UNKNOWN = 128 }; - FullTextSearch(const std::string& _field_name, int32_t _limit, const std::string& _query, - const SearchType& _search_type) - : field_name(_field_name), limit(_limit), query(_query), search_type(_search_type) {} + FullTextSearch(const std::string& _field_name, std::optional _limit, + const std::string& _query, const SearchType& _search_type, + const std::optional& _pre_filter) + : field_name(_field_name), + limit(_limit), + query(_query), + search_type(_search_type), + pre_filter(_pre_filter) {} /// Name of the field to search within (must be a full-text indexed field). std::string field_name; - /// Maximum number of documents to return. Ordered by scores. - int32_t limit; + /// Maximum number of documents to return. If set, limit ordered by top scores. Otherwise, no + /// score return. + std::optional limit; /// The query string to search for. The interpretation depends on search_type: /// /// - For MATCH_ALL/MATCH_ANY: keywords are split into terms using the **same analyzer as @@ -70,5 +76,9 @@ struct PAIMON_EXPORT FullTextSearch { std::string query; /// Type of search to perform. SearchType search_type; + /// A pre-filter based on **local row IDs**, implemented by leveraging another global index. + /// Only rows whose local row ID is present in `pre_filter` will be included during search. + /// If not set, all rows will be included. + std::optional pre_filter; }; } // namespace paimon diff --git a/include/paimon/utils/roaring_bitmap64.h b/include/paimon/utils/roaring_bitmap64.h index 7b1eacfb..f170d66e 100644 --- a/include/paimon/utils/roaring_bitmap64.h +++ b/include/paimon/utils/roaring_bitmap64.h @@ -62,6 +62,8 @@ class PAIMON_EXPORT RoaringBitmap64 { Iterator& operator++(); bool operator==(const Iterator& other) const; bool operator!=(const Iterator& other) const; + /// Move the iterator to the value which is equal or larger than input value + void EqualOrLarger(int64_t value); private: void* iterator_ = nullptr; diff --git a/src/paimon/common/utils/roaring_bitmap64.cpp b/src/paimon/common/utils/roaring_bitmap64.cpp index 98b943d7..ae078e76 100644 --- a/src/paimon/common/utils/roaring_bitmap64.cpp +++ b/src/paimon/common/utils/roaring_bitmap64.cpp @@ -97,6 +97,10 @@ bool RoaringBitmap64::Iterator::operator!=(const Iterator& other) const { return !(*this == other); } +void RoaringBitmap64::Iterator::EqualOrLarger(int64_t value) { + [[maybe_unused]] bool _ = GetIterator(iterator_).move(value); +} + RoaringBitmap64::RoaringBitmap64() { roaring_bitmap_ = new roaring::Roaring64Map(); } diff --git a/src/paimon/common/utils/roaring_bitmap64_test.cpp b/src/paimon/common/utils/roaring_bitmap64_test.cpp index 8e6b2084..71e15c8a 100644 --- a/src/paimon/common/utils/roaring_bitmap64_test.cpp +++ b/src/paimon/common/utils/roaring_bitmap64_test.cpp @@ -403,4 +403,15 @@ TEST(RoaringBitmap64Test, TestFromRoaringBitmap32) { } } +TEST(RoaringBitmap64Test, TestIteratorEqualOrLarger) { + RoaringBitmap64 roaring = RoaringBitmap64::From({1l, 3l, 5l, 100l}); + auto iter = roaring.Begin(); + ASSERT_EQ(*iter, 1l); + iter.EqualOrLarger(5l); + ASSERT_EQ(*iter, 5l); + iter.EqualOrLarger(10l); + ASSERT_EQ(*iter, 100l); + iter.EqualOrLarger(200l); + ASSERT_EQ(iter, roaring.End()); +} } // namespace paimon::test diff --git a/src/paimon/global_index/lucene/CMakeLists.txt b/src/paimon/global_index/lucene/CMakeLists.txt index b049ca88..43c76277 100644 --- a/src/paimon/global_index/lucene/CMakeLists.txt +++ b/src/paimon/global_index/lucene/CMakeLists.txt @@ -41,6 +41,7 @@ if(PAIMON_ENABLE_LUCENE) lucene_api_test.cpp lucene_directory_test.cpp lucene_global_index_test.cpp + lucene_filter_test.cpp EXTRA_INCLUDES ${LUCENE_INCLUDE_DIR} STATIC_LINK_LIBS diff --git a/src/paimon/global_index/lucene/lucene_api_test.cpp b/src/paimon/global_index/lucene/lucene_api_test.cpp index 815ca510..6d75c040 100644 --- a/src/paimon/global_index/lucene/lucene_api_test.cpp +++ b/src/paimon/global_index/lucene/lucene_api_test.cpp @@ -22,7 +22,67 @@ #include "paimon/testing/utils/testharness.h" namespace paimon::lucene::test { -TEST(LuceneInterfaceTest, TestSimple) { +class LuceneInterfaceTest : public ::testing::Test { + public: + void SetUp() override {} + void TearDown() override {} + + class TestDocIdSetIterator : public Lucene::DocIdSetIterator { + public: + explicit TestDocIdSetIterator(const std::vector& ids) + : Lucene::DocIdSetIterator(), ids_(ids) {} + + int32_t advance(int32_t target) override { + int32_t doc_id = nextDoc(); + while (doc_id < target) { + doc_id = nextDoc(); + } + return doc_id; + } + int32_t docID() override { + return ids_[cursor_]; + } + int32_t nextDoc() override { + if (cursor_ == ids_.size()) { + return Lucene::DocIdSetIterator::NO_MORE_DOCS; + } + return ids_[cursor_++]; + } + + private: + size_t cursor_ = 0; + std::vector ids_; + }; + + class TestDocIdSet : public Lucene::DocIdSet { + public: + explicit TestDocIdSet(const std::vector& ids) : DocIdSet(), ids_(ids) {} + + Lucene::DocIdSetIteratorPtr iterator() override { + return Lucene::newLucene(ids_); + } + bool isCacheable() override { + return true; + } + + private: + std::vector ids_; + }; + + class TestFilter : public Lucene::Filter { + public: + explicit TestFilter(const std::vector& ids) : ids_(ids) {} + + Lucene::DocIdSetPtr getDocIdSet(const Lucene::IndexReaderPtr& reader) override { + return Lucene::newLucene(ids_); + } + + private: + std::vector ids_; + }; +}; + +TEST_F(LuceneInterfaceTest, TestSimple) { auto dir = paimon::test::UniqueTestDirectory::Create("local"); std::string index_path = dir->Str() + "/lucene_test"; auto lucene_dir = Lucene::FSDirectory::open(LuceneUtils::StringToWstring(index_path), @@ -68,11 +128,18 @@ TEST(LuceneInterfaceTest, TestSimple) { parser->setAllowLeadingWildcard(true); auto search = [&](const std::wstring& query_str, int32_t limit, + const std::optional> selected_id, const std::vector& expected_doc_id_vec, const std::vector& expected_doc_id_content_vec) { Lucene::QueryPtr query = parser->parse(query_str); - Lucene::TopDocsPtr results = searcher->search(query, limit); - ASSERT_EQ(expected_doc_id_vec.size(), results->scoreDocs.size()); + Lucene::TopDocsPtr results; + if (selected_id) { + Lucene::FilterPtr lucene_filter = Lucene::newLucene(selected_id.value()); + results = searcher->search(query, lucene_filter, limit); + } else { + results = searcher->search(query, limit); + } + // ASSERT_EQ(expected_doc_id_vec.size(), results->scoreDocs.size()); std::vector resule_doc_id_vec; std::vector result_doc_id_content_vec; @@ -86,18 +153,29 @@ TEST(LuceneInterfaceTest, TestSimple) { }; // result is sorted by tf-idf score - search(L"document", /*limit=*/10, std::vector({2, 1, 0}), + search(L"document", /*limit=*/10, /*selected_id=*/std::nullopt, std::vector({2, 1, 0}), std::vector({L"2", L"1", L"0"})); - search(L"document", /*limit=*/1, std::vector({2}), std::vector({L"2"})); - search(L"test AND document", /*limit=*/10, std::vector({2, 0}), - std::vector({L"2", L"0"})); - search(L"test OR new", /*limit=*/10, std::vector({1, 0, 2}), - std::vector({L"1", L"0", L"2"})); - search(L"\"test document\"", /*limit=*/10, std::vector({0}), - std::vector({L"0"})); - search(L"unordered", /*limit=*/10, std::vector({3}), + search(L"document", /*limit=*/1, /*selected_id=*/std::nullopt, std::vector({2}), + std::vector({L"2"})); + search(L"test AND document", /*limit=*/10, /*selected_id=*/std::nullopt, + std::vector({2, 0}), std::vector({L"2", L"0"})); + search(L"test OR new", /*limit=*/10, /*selected_id=*/std::nullopt, + std::vector({1, 0, 2}), std::vector({L"1", L"0", L"2"})); + search(L"\"test document\"", /*limit=*/10, /*selected_id=*/std::nullopt, + std::vector({0}), std::vector({L"0"})); + search(L"unordered", /*limit=*/10, /*selected_id=*/std::nullopt, std::vector({3}), std::vector({L"5"})); - search(L"*orDer*", /*limit=*/10, std::vector({3}), std::vector({L"5"})); + search(L"*orDer*", /*limit=*/10, /*selected_id=*/std::nullopt, std::vector({3}), + std::vector({L"5"})); + + // test filter + search(L"document", /*limit=*/10, /*selected_id=*/std::optional>({0, 1}), + std::vector({1, 0}), std::vector({L"1", L"0"})); + search(L"document OR unordered", /*limit=*/10, + /*selected_id=*/std::optional>({0, 1, 3}), + std::vector({3, 1, 0}), std::vector({L"5", L"1", L"0"})); + search(L"unordered", /*limit=*/10, /*selected_id=*/std::optional>({0}), + std::vector(), std::vector()); reader->close(); lucene_dir->close(); diff --git a/src/paimon/global_index/lucene/lucene_collector.h b/src/paimon/global_index/lucene/lucene_collector.h new file mode 100644 index 00000000..cec1193c --- /dev/null +++ b/src/paimon/global_index/lucene/lucene_collector.h @@ -0,0 +1,44 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include "lucene++/LuceneHeaders.h" +#include "paimon/utils/roaring_bitmap64.h" + +namespace paimon::lucene { +class LuceneCollector : public Lucene::Collector { + public: + LuceneCollector() : Lucene::Collector() {} + void setScorer(const Lucene::ScorerPtr& scorer) override { + // ignore scorer + } + void collect(int32_t doc) override { + bitmap_.Add(doc_base_ + doc); + } + void setNextReader(const Lucene::IndexReaderPtr& reader, int32_t doc_base) override { + doc_base_ = doc_base; + } + bool acceptsDocsOutOfOrder() override { + return true; + } + const RoaringBitmap64& GetBitmap() const { + return bitmap_; + } + + private: + RoaringBitmap64 bitmap_; + int64_t doc_base_; +}; +} // namespace paimon::lucene diff --git a/src/paimon/global_index/lucene/lucene_filter.h b/src/paimon/global_index/lucene/lucene_filter.h new file mode 100644 index 00000000..a7c4c56d --- /dev/null +++ b/src/paimon/global_index/lucene/lucene_filter.h @@ -0,0 +1,83 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include "lucene++/LuceneHeaders.h" +#include "paimon/utils/roaring_bitmap64.h" + +namespace paimon::lucene { +class BitmapDocIdSetIterator : public Lucene::DocIdSetIterator { + public: + explicit BitmapDocIdSetIterator(const RoaringBitmap64* ids) + : Lucene::DocIdSetIterator(), ids_(ids), iter_(ids->Begin()) {} + + int32_t advance(int32_t target) override { + iter_.EqualOrLarger(static_cast(target)); + if (iter_ == ids_->End()) { + return Lucene::DocIdSetIterator::NO_MORE_DOCS; + } + return static_cast(*iter_); + } + + int32_t docID() override { + if (iter_ == ids_->End()) { + return Lucene::DocIdSetIterator::NO_MORE_DOCS; + } + return static_cast(*iter_); + } + + int32_t nextDoc() override { + if (iter_ == ids_->End()) { + return Lucene::DocIdSetIterator::NO_MORE_DOCS; + } + int32_t id = static_cast(*iter_); + ++iter_; + return id; + } + + private: + const RoaringBitmap64* ids_; + RoaringBitmap64::Iterator iter_; +}; + +class BitmapDocIdSet : public Lucene::DocIdSet { + public: + explicit BitmapDocIdSet(const RoaringBitmap64* ids) : DocIdSet(), ids_(ids) {} + + Lucene::DocIdSetIteratorPtr iterator() override { + return Lucene::newLucene(ids_); + } + + bool isCacheable() override { + return true; + } + + private: + const RoaringBitmap64* ids_; +}; + +class LuceneFilter : public Lucene::Filter { + public: + explicit LuceneFilter(const RoaringBitmap64* ids) : ids_(ids) {} + + Lucene::DocIdSetPtr getDocIdSet(const Lucene::IndexReaderPtr& reader) override { + return Lucene::newLucene(ids_); + } + + private: + const RoaringBitmap64* ids_; +}; + +} // namespace paimon::lucene diff --git a/src/paimon/global_index/lucene/lucene_filter_test.cpp b/src/paimon/global_index/lucene/lucene_filter_test.cpp new file mode 100644 index 00000000..8d0efb6d --- /dev/null +++ b/src/paimon/global_index/lucene/lucene_filter_test.cpp @@ -0,0 +1,46 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "paimon/global_index/lucene/lucene_filter.h" + +#include "gtest/gtest.h" +#include "lucene++/LuceneHeaders.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::lucene::test { +TEST(LuceneFilterTest, TestSimple) { + RoaringBitmap64 roaring = RoaringBitmap64::From({1l, 3l, 5l, 100l}); + LuceneFilter filter(&roaring); + + auto doc_id_set = filter.getDocIdSet(/*reader=*/Lucene::IndexReaderPtr()); + ASSERT_TRUE(doc_id_set); + + auto doc_iter = doc_id_set->iterator(); + ASSERT_TRUE(doc_iter); + ASSERT_TRUE(doc_id_set->isCacheable()); + + ASSERT_EQ(1, doc_iter->nextDoc()); + + ASSERT_EQ(5, doc_iter->advance(4)); + ASSERT_EQ(5, doc_iter->docID()); + + ASSERT_EQ(100, doc_iter->advance(100)); + + ASSERT_EQ(Lucene::DocIdSetIterator::NO_MORE_DOCS, doc_iter->advance(1000)); + ASSERT_EQ(Lucene::DocIdSetIterator::NO_MORE_DOCS, doc_iter->docID()); + ASSERT_EQ(Lucene::DocIdSetIterator::NO_MORE_DOCS, doc_iter->nextDoc()); +} + +} // namespace paimon::lucene::test diff --git a/src/paimon/global_index/lucene/lucene_global_index.cpp b/src/paimon/global_index/lucene/lucene_global_index.cpp index 1502cf2a..7a027e18 100644 --- a/src/paimon/global_index/lucene/lucene_global_index.cpp +++ b/src/paimon/global_index/lucene/lucene_global_index.cpp @@ -25,8 +25,10 @@ #include "paimon/common/utils/rapidjson_util.h" #include "paimon/common/utils/uuid.h" #include "paimon/global_index/bitmap_vector_search_global_index_result.h" +#include "paimon/global_index/lucene/lucene_collector.h" #include "paimon/global_index/lucene/lucene_defs.h" #include "paimon/global_index/lucene/lucene_directory.h" +#include "paimon/global_index/lucene/lucene_filter.h" #include "paimon/global_index/lucene/lucene_utils.h" #include "paimon/io/data_input_stream.h" @@ -331,7 +333,7 @@ std::vector LuceneGlobalIndexReader::TokenizeQuery(const std::stri return wterms; } -Result> LuceneGlobalIndexReader::VisitFullTextSearch( +Result> LuceneGlobalIndexReader::VisitFullTextSearch( const std::shared_ptr& full_text_search) { try { Lucene::QueryPtr query; @@ -381,32 +383,45 @@ Result> LuceneGlobalIndexReader:: fmt::format("Not support for FullTextSearch SearchType {}", static_cast(full_text_search->search_type))); } + Lucene::FilterPtr filter = + full_text_search->pre_filter + ? Lucene::newLucene(&(full_text_search->pre_filter.value())) + : Lucene::FilterPtr(); - Lucene::TopDocsPtr results = searcher_->search(query, full_text_search->limit); + if (full_text_search->limit) { + Lucene::TopDocsPtr results = + searcher_->search(query, filter, full_text_search->limit.value()); - // prepare BitmapVectorSearchGlobalIndexResult - std::map id_to_score; - for (auto score_doc : results->scoreDocs) { - Lucene::DocumentPtr result_doc = searcher_->doc(score_doc->doc); - std::string row_id_str = - LuceneUtils::WstringToString(result_doc->get(kRowIdFieldWstring)); - std::optional row_id = StringUtils::StringToValue(row_id_str); - if (!row_id) { - return Status::Invalid(fmt::format("parse row id str {} to int failed"), - row_id_str); + // prepare BitmapVectorSearchGlobalIndexResult + std::map id_to_score; + for (auto score_doc : results->scoreDocs) { + Lucene::DocumentPtr result_doc = searcher_->doc(score_doc->doc); + std::string row_id_str = + LuceneUtils::WstringToString(result_doc->get(kRowIdFieldWstring)); + std::optional row_id = StringUtils::StringToValue(row_id_str); + if (!row_id) { + return Status::Invalid(fmt::format("parse row id str {} to int failed"), + row_id_str); + } + id_to_score[static_cast(row_id.value())] = + static_cast(score_doc->score); } - id_to_score[static_cast(row_id.value())] = - static_cast(score_doc->score); - } - RoaringBitmap64 bitmap; - std::vector scores; - scores.reserve(id_to_score.size()); - for (const auto& [id, score] : id_to_score) { - bitmap.Add(id); - scores.push_back(score); + RoaringBitmap64 bitmap; + std::vector scores; + scores.reserve(id_to_score.size()); + for (const auto& [id, score] : id_to_score) { + bitmap.Add(id); + scores.push_back(score); + } + return std::make_shared(std::move(bitmap), + std::move(scores)); + } else { + // with no limit & no score + auto collector = Lucene::newLucene(); + searcher_->search(query, filter, collector); + return std::make_shared( + [collector]() -> Result { return collector->GetBitmap(); }); } - return std::make_shared(std::move(bitmap), - std::move(scores)); } catch (const std::exception& e) { return Status::Invalid(fmt::format("visit term query failed, with {} error.", e.what())); } catch (...) { diff --git a/src/paimon/global_index/lucene/lucene_global_index.h b/src/paimon/global_index/lucene/lucene_global_index.h index 67dd27ec..f5875f9d 100644 --- a/src/paimon/global_index/lucene/lucene_global_index.h +++ b/src/paimon/global_index/lucene/lucene_global_index.h @@ -164,7 +164,7 @@ class LuceneGlobalIndexReader : public GlobalIndexReader { "LuceneGlobalIndexReader is not supposed to handle vector search query"); } - Result> VisitFullTextSearch( + Result> VisitFullTextSearch( const std::shared_ptr& full_text_search); private: diff --git a/src/paimon/global_index/lucene/lucene_global_index_test.cpp b/src/paimon/global_index/lucene/lucene_global_index_test.cpp index bd13a9c1..a93c88f5 100644 --- a/src/paimon/global_index/lucene/lucene_global_index_test.cpp +++ b/src/paimon/global_index/lucene/lucene_global_index_test.cpp @@ -100,16 +100,21 @@ class LuceneGlobalIndexTest : public ::testing::Test, pool_); } - void CheckResult(const std::shared_ptr& result, + void CheckResult(const std::shared_ptr& result, const std::vector& expected_ids) const { - auto typed_result = std::dynamic_pointer_cast(result); - ASSERT_TRUE(typed_result); - ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, typed_result->GetBitmap()); + const RoaringBitmap64* bitmap = nullptr; + if (auto vector_search_result = + std::dynamic_pointer_cast(result)) { + ASSERT_OK_AND_ASSIGN(bitmap, vector_search_result->GetBitmap()); + ASSERT_EQ(vector_search_result->scores_.size(), expected_ids.size()); + } else if (auto bitmap_result = + std::dynamic_pointer_cast(result)) { + ASSERT_OK_AND_ASSIGN(bitmap, bitmap_result->GetBitmap()); + } ASSERT_TRUE(bitmap); - ASSERT_EQ(*(typed_result->GetBitmap().value()), RoaringBitmap64::From(expected_ids)) - << "result=" << (typed_result->GetBitmap().value())->ToString() + ASSERT_EQ(*bitmap, RoaringBitmap64::From(expected_ids)) + << "result=" << bitmap->ToString() << ", expected=" << RoaringBitmap64::From(expected_ids).ToString(); - ASSERT_EQ(typed_result->scores_.size(), expected_ids.size()); } private: @@ -157,66 +162,128 @@ TEST_P(LuceneGlobalIndexTest, TestSimple) { ASSERT_OK_AND_ASSIGN(auto result, lucene_reader->VisitFullTextSearch(std::make_shared( "f0", - /*limit=*/10, "document", FullTextSearch::SearchType::MATCH_ALL))); + /*limit=*/10, "document", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/std::nullopt))); CheckResult(result, {2l, 1l, 0l}); } { ASSERT_OK_AND_ASSIGN(auto result, lucene_reader->VisitFullTextSearch(std::make_shared( "f0", - /*limit=*/1, "document", FullTextSearch::SearchType::MATCH_ANY))); + /*limit=*/1, "document", FullTextSearch::SearchType::MATCH_ANY, + /*pre_filter=*/std::nullopt))); CheckResult(result, {2l}); } { ASSERT_OK_AND_ASSIGN( - auto result, - lucene_reader->VisitFullTextSearch(std::make_shared( - "f0", - /*limit=*/10, "test document", FullTextSearch::SearchType::MATCH_ALL))); + auto result, lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/10, "test document", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/std::nullopt))); CheckResult(result, {2l, 0l}); } { ASSERT_OK_AND_ASSIGN(auto result, lucene_reader->VisitFullTextSearch(std::make_shared( "f0", - /*limit=*/10, "test new", FullTextSearch::SearchType::MATCH_ANY))); + /*limit=*/10, "test new", FullTextSearch::SearchType::MATCH_ANY, + /*pre_filter=*/std::nullopt))); CheckResult(result, {1l, 0l, 2l}); } { - ASSERT_OK_AND_ASSIGN( - auto result, lucene_reader->VisitFullTextSearch(std::make_shared( - "f0", - /*limit=*/10, "test document", FullTextSearch::SearchType::PHRASE))); + ASSERT_OK_AND_ASSIGN(auto result, + lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/10, "test document", FullTextSearch::SearchType::PHRASE, + /*pre_filter=*/std::nullopt))); CheckResult(result, {0l}); } { - ASSERT_OK_AND_ASSIGN( - auto result, lucene_reader->VisitFullTextSearch(std::make_shared( - "f0", - /*limit=*/10, "unordered", FullTextSearch::SearchType::MATCH_ALL))); + ASSERT_OK_AND_ASSIGN(auto result, + lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/10, "unordered", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/std::nullopt))); CheckResult(result, {3l}); } { ASSERT_OK_AND_ASSIGN(auto result, lucene_reader->VisitFullTextSearch(std::make_shared( "f0", - /*limit=*/10, "unorder", FullTextSearch::SearchType::PREFIX))); + /*limit=*/10, "unorder", FullTextSearch::SearchType::PREFIX, + /*pre_filter=*/std::nullopt))); CheckResult(result, {3l}); } + // test wildcard query { ASSERT_OK_AND_ASSIGN(auto result, lucene_reader->VisitFullTextSearch(std::make_shared( "f0", - /*limit=*/10, "*order*", FullTextSearch::SearchType::WILDCARD))); + /*limit=*/10, "*order*", FullTextSearch::SearchType::WILDCARD, + /*pre_filter=*/std::nullopt))); CheckResult(result, {3l}); } { ASSERT_OK_AND_ASSIGN(auto result, lucene_reader->VisitFullTextSearch(std::make_shared( "f0", - /*limit=*/10, "*or*er*", FullTextSearch::SearchType::WILDCARD))); + /*limit=*/10, "*or*er*", FullTextSearch::SearchType::WILDCARD, + /*pre_filter=*/std::nullopt))); CheckResult(result, {3l}); } + // test filter + { + ASSERT_OK_AND_ASSIGN(auto result, + lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/10, "document", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/RoaringBitmap64::From({0l, 1l})))); + CheckResult(result, {0l, 1l}); + } + { + ASSERT_OK_AND_ASSIGN(auto result, + lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/10, "document", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/RoaringBitmap64::From({2l, 100l})))); + CheckResult(result, {2l}); + } + { + ASSERT_OK_AND_ASSIGN(auto result, + lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/10, "document", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/RoaringBitmap64::From({20l, 100l})))); + CheckResult(result, {}); + } + // test no limit + { + ASSERT_OK_AND_ASSIGN( + auto result, + lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/std::nullopt, "document", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/std::nullopt))); + CheckResult(result, {0l, 1l, 2l}); + } + { + ASSERT_OK_AND_ASSIGN( + auto result, + lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/std::nullopt, "document", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/RoaringBitmap64::From({2l})))); + CheckResult(result, {2l}); + } + { + ASSERT_OK_AND_ASSIGN( + auto result, + lucene_reader->VisitFullTextSearch(std::make_shared( + "f0", + /*limit=*/std::nullopt, "document test", FullTextSearch::SearchType::MATCH_ALL, + /*pre_filter=*/RoaringBitmap64::From({1l, 2l, 3l, 100l})))); + CheckResult(result, {2l}); + } } INSTANTIATE_TEST_SUITE_P(ReadBufferSize, LuceneGlobalIndexTest, From caf743db1b108ce42f25218e5fd88aa46d4abe7e Mon Sep 17 00:00:00 2001 From: lxy264173 Date: Thu, 29 Jan 2026 18:29:35 +0800 Subject: [PATCH 2/4] fix --- src/paimon/global_index/lucene/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/paimon/global_index/lucene/CMakeLists.txt b/src/paimon/global_index/lucene/CMakeLists.txt index 43c76277..cb556ae9 100644 --- a/src/paimon/global_index/lucene/CMakeLists.txt +++ b/src/paimon/global_index/lucene/CMakeLists.txt @@ -41,7 +41,7 @@ if(PAIMON_ENABLE_LUCENE) lucene_api_test.cpp lucene_directory_test.cpp lucene_global_index_test.cpp - lucene_filter_test.cpp + lucene_filter_test.cpp EXTRA_INCLUDES ${LUCENE_INCLUDE_DIR} STATIC_LINK_LIBS From f6a9ba0cd803dff8e55e70e43f5c29ceea85a024 Mon Sep 17 00:00:00 2001 From: lxy264173 Date: Fri, 30 Jan 2026 09:04:29 +0800 Subject: [PATCH 3/4] fix --- include/paimon/predicate/full_text_search.h | 2 +- src/paimon/global_index/lucene/lucene_filter.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/paimon/predicate/full_text_search.h b/include/paimon/predicate/full_text_search.h index 377125ba..ad906d03 100644 --- a/include/paimon/predicate/full_text_search.h +++ b/include/paimon/predicate/full_text_search.h @@ -23,8 +23,8 @@ #include #include "paimon/predicate/predicate.h" +#include "paimon/utils/roaring_bitmap64.h" #include "paimon/visibility.h" - namespace paimon { /// A configuration structure for full-text search operations. struct PAIMON_EXPORT FullTextSearch { diff --git a/src/paimon/global_index/lucene/lucene_filter.h b/src/paimon/global_index/lucene/lucene_filter.h index a7c4c56d..c34f94e4 100644 --- a/src/paimon/global_index/lucene/lucene_filter.h +++ b/src/paimon/global_index/lucene/lucene_filter.h @@ -42,7 +42,7 @@ class BitmapDocIdSetIterator : public Lucene::DocIdSetIterator { if (iter_ == ids_->End()) { return Lucene::DocIdSetIterator::NO_MORE_DOCS; } - int32_t id = static_cast(*iter_); + auto id = static_cast(*iter_); ++iter_; return id; } From e896ad1c151726395fea8771ec1992d28edfc7ce Mon Sep 17 00:00:00 2001 From: lxy264173 Date: Fri, 30 Jan 2026 11:47:30 +0800 Subject: [PATCH 4/4] fix --- src/paimon/global_index/lucene/lucene_api_test.cpp | 10 +++++----- src/paimon/global_index/lucene/lucene_collector.h | 2 +- src/paimon/global_index/lucene/lucene_global_index.cpp | 4 ++-- .../global_index/lucene/lucene_global_index_test.cpp | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/paimon/global_index/lucene/lucene_api_test.cpp b/src/paimon/global_index/lucene/lucene_api_test.cpp index 6d75c040..d6868f0f 100644 --- a/src/paimon/global_index/lucene/lucene_api_test.cpp +++ b/src/paimon/global_index/lucene/lucene_api_test.cpp @@ -139,7 +139,7 @@ TEST_F(LuceneInterfaceTest, TestSimple) { } else { results = searcher->search(query, limit); } - // ASSERT_EQ(expected_doc_id_vec.size(), results->scoreDocs.size()); + ASSERT_EQ(expected_doc_id_vec.size(), results->scoreDocs.size()); std::vector resule_doc_id_vec; std::vector result_doc_id_content_vec; @@ -169,12 +169,12 @@ TEST_F(LuceneInterfaceTest, TestSimple) { std::vector({L"5"})); // test filter - search(L"document", /*limit=*/10, /*selected_id=*/std::optional>({0, 1}), + search(L"document", /*limit=*/10, /*selected_id=*/std::vector({0, 1}), std::vector({1, 0}), std::vector({L"1", L"0"})); search(L"document OR unordered", /*limit=*/10, - /*selected_id=*/std::optional>({0, 1, 3}), - std::vector({3, 1, 0}), std::vector({L"5", L"1", L"0"})); - search(L"unordered", /*limit=*/10, /*selected_id=*/std::optional>({0}), + /*selected_id=*/std::vector({0, 1, 3}), std::vector({3, 1, 0}), + std::vector({L"5", L"1", L"0"})); + search(L"unordered", /*limit=*/10, /*selected_id=*/std::vector({0}), std::vector(), std::vector()); reader->close(); diff --git a/src/paimon/global_index/lucene/lucene_collector.h b/src/paimon/global_index/lucene/lucene_collector.h index cec1193c..9fda5a68 100644 --- a/src/paimon/global_index/lucene/lucene_collector.h +++ b/src/paimon/global_index/lucene/lucene_collector.h @@ -39,6 +39,6 @@ class LuceneCollector : public Lucene::Collector { private: RoaringBitmap64 bitmap_; - int64_t doc_base_; + int64_t doc_base_ = 0; }; } // namespace paimon::lucene diff --git a/src/paimon/global_index/lucene/lucene_global_index.cpp b/src/paimon/global_index/lucene/lucene_global_index.cpp index 7a027e18..5e4dc5f6 100644 --- a/src/paimon/global_index/lucene/lucene_global_index.cpp +++ b/src/paimon/global_index/lucene/lucene_global_index.cpp @@ -400,8 +400,8 @@ Result> LuceneGlobalIndexReader::VisitFullTex LuceneUtils::WstringToString(result_doc->get(kRowIdFieldWstring)); std::optional row_id = StringUtils::StringToValue(row_id_str); if (!row_id) { - return Status::Invalid(fmt::format("parse row id str {} to int failed"), - row_id_str); + return Status::Invalid( + fmt::format("parse row id str {} to int failed", row_id_str)); } id_to_score[static_cast(row_id.value())] = static_cast(score_doc->score); diff --git a/src/paimon/global_index/lucene/lucene_global_index_test.cpp b/src/paimon/global_index/lucene/lucene_global_index_test.cpp index a93c88f5..f5274f7e 100644 --- a/src/paimon/global_index/lucene/lucene_global_index_test.cpp +++ b/src/paimon/global_index/lucene/lucene_global_index_test.cpp @@ -106,7 +106,7 @@ class LuceneGlobalIndexTest : public ::testing::Test, if (auto vector_search_result = std::dynamic_pointer_cast(result)) { ASSERT_OK_AND_ASSIGN(bitmap, vector_search_result->GetBitmap()); - ASSERT_EQ(vector_search_result->scores_.size(), expected_ids.size()); + ASSERT_EQ(vector_search_result->GetScores().size(), expected_ids.size()); } else if (auto bitmap_result = std::dynamic_pointer_cast(result)) { ASSERT_OK_AND_ASSIGN(bitmap, bitmap_result->GetBitmap());