Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions include/paimon/global_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ namespace paimon {
/// not necessarily the exact number of threads at a given point in time.
///
/// You can change this number using SetArrowCpuThreadPoolCapacity().
PAIMON_EXPORT int GetArrowCpuThreadPoolCapacity();
PAIMON_EXPORT int32_t GetArrowCpuThreadPoolCapacity();

/// Set the capacity of the arrow's global thread pool
/// This is a simple wrapper of arrow::SetCpuThreadPoolCapacity()
Expand All @@ -40,6 +40,6 @@ PAIMON_EXPORT int GetArrowCpuThreadPoolCapacity();
/// The current number is returned by GetArrowCpuThreadPoolCapacity().
/// Currently, this capacity will significantly affect the performance
/// of parquet file batch read.
PAIMON_EXPORT Status SetArrowCpuThreadPoolCapacity(int threads);
PAIMON_EXPORT Status SetArrowCpuThreadPoolCapacity(int32_t threads);

} // namespace paimon
22 changes: 16 additions & 6 deletions include/paimon/predicate/full_text_search.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@
#include <vector>

#include "paimon/predicate/predicate.h"
#include "paimon/utils/roaring_bitmap64.h"
#include "paimon/visibility.h"

namespace paimon {
/// A configuration structure for full-text search operations.
struct PAIMON_EXPORT FullTextSearch {
Expand All @@ -44,14 +44,20 @@ struct PAIMON_EXPORT FullTextSearch {
UNKNOWN = 128
};

FullTextSearch(const std::string& _field_name, int32_t _limit, const std::string& _query,
const SearchType& _search_type)
: field_name(_field_name), limit(_limit), query(_query), search_type(_search_type) {}
FullTextSearch(const std::string& _field_name, std::optional<int32_t> _limit,
const std::string& _query, const SearchType& _search_type,
const std::optional<RoaringBitmap64>& _pre_filter)
: field_name(_field_name),
limit(_limit),
query(_query),
search_type(_search_type),
pre_filter(_pre_filter) {}

/// Name of the field to search within (must be a full-text indexed field).
std::string field_name;
/// Maximum number of documents to return. Ordered by scores.
int32_t limit;
/// Maximum number of documents to return. If set, results are limited to the top-scoring
/// documents. Otherwise, no scores are returned.
std::optional<int32_t> limit;
/// The query string to search for. The interpretation depends on search_type:
///
/// - For MATCH_ALL/MATCH_ANY: keywords are split into terms using the **same analyzer as
Expand All @@ -70,5 +76,9 @@ struct PAIMON_EXPORT FullTextSearch {
std::string query;
/// Type of search to perform.
SearchType search_type;
/// A pre-filter based on **local row IDs**, implemented by leveraging another global index.
/// Only rows whose local row ID is present in `pre_filter` will be included during search.
/// If not set, all rows will be included.
std::optional<RoaringBitmap64> pre_filter;
};
} // namespace paimon
2 changes: 2 additions & 0 deletions include/paimon/utils/roaring_bitmap64.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ class PAIMON_EXPORT RoaringBitmap64 {
Iterator& operator++();
bool operator==(const Iterator& other) const;
bool operator!=(const Iterator& other) const;
/// Move the iterator to the first value that is equal to or larger than the input value.
void EqualOrLarger(int64_t value);

private:
void* iterator_ = nullptr;
Expand Down
4 changes: 4 additions & 0 deletions src/paimon/common/utils/roaring_bitmap64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ bool RoaringBitmap64::Iterator::operator!=(const Iterator& other) const {
return !(*this == other);
}

/// Repositions this iterator on the first value that is equal to or larger than `value`.
///
/// The boolean reported by the underlying iterator's move() is intentionally discarded.
void RoaringBitmap64::Iterator::EqualOrLarger(int64_t value) {
    auto&& underlying = GetIterator(iterator_);
    static_cast<void>(underlying.move(value));
}

RoaringBitmap64::RoaringBitmap64() {
roaring_bitmap_ = new roaring::Roaring64Map();
}
Expand Down
11 changes: 11 additions & 0 deletions src/paimon/common/utils/roaring_bitmap64_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -403,4 +403,15 @@ TEST(RoaringBitmap64Test, TestFromRoaringBitmap32) {
}
}

// Verifies that Iterator::EqualOrLarger() lands on the requested value when it is present, on the
// next larger value when it is absent, and on End() when no such value exists.
TEST(RoaringBitmap64Test, TestIteratorEqualOrLarger) {
    RoaringBitmap64 bitmap = RoaringBitmap64::From({1l, 3l, 5l, 100l});
    auto it = bitmap.Begin();
    // A fresh iterator starts on the smallest value.
    ASSERT_EQ(*it, 1l);

    // Exact hit.
    it.EqualOrLarger(5l);
    ASSERT_EQ(*it, 5l);

    // 10 is absent, so the iterator stops on the next larger value.
    it.EqualOrLarger(10l);
    ASSERT_EQ(*it, 100l);

    // Nothing is >= 200, so the iterator reaches the end.
    it.EqualOrLarger(200l);
    ASSERT_TRUE(it == bitmap.End());
}
} // namespace paimon::test
1 change: 1 addition & 0 deletions src/paimon/global_index/lucene/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ if(PAIMON_ENABLE_LUCENE)
lucene_api_test.cpp
lucene_directory_test.cpp
lucene_global_index_test.cpp
lucene_filter_test.cpp
EXTRA_INCLUDES
${LUCENE_INCLUDE_DIR}
STATIC_LINK_LIBS
Expand Down
102 changes: 90 additions & 12 deletions src/paimon/global_index/lucene/lucene_api_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,67 @@
#include "paimon/testing/utils/testharness.h"

namespace paimon::lucene::test {
TEST(LuceneInterfaceTest, TestSimple) {
class LuceneInterfaceTest : public ::testing::Test {
public:
void SetUp() override {}
void TearDown() override {}

class TestDocIdSetIterator : public Lucene::DocIdSetIterator {
public:
explicit TestDocIdSetIterator(const std::vector<int32_t>& ids)
: Lucene::DocIdSetIterator(), ids_(ids) {}

int32_t advance(int32_t target) override {
int32_t doc_id = nextDoc();
while (doc_id < target) {
doc_id = nextDoc();
}
return doc_id;
}
int32_t docID() override {
return ids_[cursor_];
}
int32_t nextDoc() override {
if (cursor_ == ids_.size()) {
return Lucene::DocIdSetIterator::NO_MORE_DOCS;
}
return ids_[cursor_++];
}

private:
size_t cursor_ = 0;
std::vector<int32_t> ids_;
};

class TestDocIdSet : public Lucene::DocIdSet {
public:
explicit TestDocIdSet(const std::vector<int32_t>& ids) : DocIdSet(), ids_(ids) {}

Lucene::DocIdSetIteratorPtr iterator() override {
return Lucene::newLucene<TestDocIdSetIterator>(ids_);
}
bool isCacheable() override {
return true;
}

private:
std::vector<int32_t> ids_;
};

class TestFilter : public Lucene::Filter {
public:
explicit TestFilter(const std::vector<int32_t>& ids) : ids_(ids) {}

Lucene::DocIdSetPtr getDocIdSet(const Lucene::IndexReaderPtr& reader) override {
return Lucene::newLucene<TestDocIdSet>(ids_);
}

private:
std::vector<int32_t> ids_;
};
};

TEST_F(LuceneInterfaceTest, TestSimple) {
auto dir = paimon::test::UniqueTestDirectory::Create("local");
std::string index_path = dir->Str() + "/lucene_test";
auto lucene_dir = Lucene::FSDirectory::open(LuceneUtils::StringToWstring(index_path),
Expand Down Expand Up @@ -68,10 +128,17 @@ TEST(LuceneInterfaceTest, TestSimple) {
parser->setAllowLeadingWildcard(true);

auto search = [&](const std::wstring& query_str, int32_t limit,
const std::optional<std::vector<int32_t>> selected_id,
const std::vector<int32_t>& expected_doc_id_vec,
const std::vector<std::wstring>& expected_doc_id_content_vec) {
Lucene::QueryPtr query = parser->parse(query_str);
Lucene::TopDocsPtr results = searcher->search(query, limit);
Lucene::TopDocsPtr results;
if (selected_id) {
Lucene::FilterPtr lucene_filter = Lucene::newLucene<TestFilter>(selected_id.value());
results = searcher->search(query, lucene_filter, limit);
} else {
results = searcher->search(query, limit);
}
ASSERT_EQ(expected_doc_id_vec.size(), results->scoreDocs.size());

std::vector<int32_t> resule_doc_id_vec;
Expand All @@ -86,18 +153,29 @@ TEST(LuceneInterfaceTest, TestSimple) {
};

// result is sorted by tf-idf score
search(L"document", /*limit=*/10, std::vector<int32_t>({2, 1, 0}),
search(L"document", /*limit=*/10, /*selected_id=*/std::nullopt, std::vector<int32_t>({2, 1, 0}),
std::vector<std::wstring>({L"2", L"1", L"0"}));
search(L"document", /*limit=*/1, std::vector<int32_t>({2}), std::vector<std::wstring>({L"2"}));
search(L"test AND document", /*limit=*/10, std::vector<int32_t>({2, 0}),
std::vector<std::wstring>({L"2", L"0"}));
search(L"test OR new", /*limit=*/10, std::vector<int32_t>({1, 0, 2}),
std::vector<std::wstring>({L"1", L"0", L"2"}));
search(L"\"test document\"", /*limit=*/10, std::vector<int32_t>({0}),
std::vector<std::wstring>({L"0"}));
search(L"unordered", /*limit=*/10, std::vector<int32_t>({3}),
search(L"document", /*limit=*/1, /*selected_id=*/std::nullopt, std::vector<int32_t>({2}),
std::vector<std::wstring>({L"2"}));
search(L"test AND document", /*limit=*/10, /*selected_id=*/std::nullopt,
std::vector<int32_t>({2, 0}), std::vector<std::wstring>({L"2", L"0"}));
search(L"test OR new", /*limit=*/10, /*selected_id=*/std::nullopt,
std::vector<int32_t>({1, 0, 2}), std::vector<std::wstring>({L"1", L"0", L"2"}));
search(L"\"test document\"", /*limit=*/10, /*selected_id=*/std::nullopt,
std::vector<int32_t>({0}), std::vector<std::wstring>({L"0"}));
search(L"unordered", /*limit=*/10, /*selected_id=*/std::nullopt, std::vector<int32_t>({3}),
std::vector<std::wstring>({L"5"}));
search(L"*orDer*", /*limit=*/10, std::vector<int32_t>({3}), std::vector<std::wstring>({L"5"}));
search(L"*orDer*", /*limit=*/10, /*selected_id=*/std::nullopt, std::vector<int32_t>({3}),
std::vector<std::wstring>({L"5"}));

// test filter
search(L"document", /*limit=*/10, /*selected_id=*/std::vector<int32_t>({0, 1}),
std::vector<int32_t>({1, 0}), std::vector<std::wstring>({L"1", L"0"}));
search(L"document OR unordered", /*limit=*/10,
/*selected_id=*/std::vector<int32_t>({0, 1, 3}), std::vector<int32_t>({3, 1, 0}),
std::vector<std::wstring>({L"5", L"1", L"0"}));
search(L"unordered", /*limit=*/10, /*selected_id=*/std::vector<int32_t>({0}),
std::vector<int32_t>(), std::vector<std::wstring>());

reader->close();
lucene_dir->close();
Expand Down
44 changes: 44 additions & 0 deletions src/paimon/global_index/lucene/lucene_collector.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/*
* Copyright 2026-present Alibaba Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "lucene++/LuceneHeaders.h"
#include "paimon/utils/roaring_bitmap64.h"

namespace paimon::lucene {
class LuceneCollector : public Lucene::Collector {
public:
LuceneCollector() : Lucene::Collector() {}
void setScorer(const Lucene::ScorerPtr& scorer) override {
// ignore scorer
}
void collect(int32_t doc) override {
bitmap_.Add(doc_base_ + doc);
}
void setNextReader(const Lucene::IndexReaderPtr& reader, int32_t doc_base) override {
doc_base_ = doc_base;
}
bool acceptsDocsOutOfOrder() override {
return true;
}
const RoaringBitmap64& GetBitmap() const {
return bitmap_;
}

private:
RoaringBitmap64 bitmap_;
int64_t doc_base_ = 0;
};
} // namespace paimon::lucene
83 changes: 83 additions & 0 deletions src/paimon/global_index/lucene/lucene_filter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/*
* Copyright 2026-present Alibaba Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "lucene++/LuceneHeaders.h"
#include "paimon/utils/roaring_bitmap64.h"

namespace paimon::lucene {
class BitmapDocIdSetIterator : public Lucene::DocIdSetIterator {
public:
explicit BitmapDocIdSetIterator(const RoaringBitmap64* ids)
: Lucene::DocIdSetIterator(), ids_(ids), iter_(ids->Begin()) {}

int32_t advance(int32_t target) override {
iter_.EqualOrLarger(static_cast<int64_t>(target));
if (iter_ == ids_->End()) {
return Lucene::DocIdSetIterator::NO_MORE_DOCS;
}
return static_cast<int32_t>(*iter_);
}

int32_t docID() override {
if (iter_ == ids_->End()) {
return Lucene::DocIdSetIterator::NO_MORE_DOCS;
}
return static_cast<int32_t>(*iter_);
}

int32_t nextDoc() override {
if (iter_ == ids_->End()) {
return Lucene::DocIdSetIterator::NO_MORE_DOCS;
}
auto id = static_cast<int32_t>(*iter_);
++iter_;
return id;
}

private:
const RoaringBitmap64* ids_;
RoaringBitmap64::Iterator iter_;
};

/// DocIdSet view over a borrowed RoaringBitmap64.
///
/// Only the pointer is stored; the bitmap itself is not copied.
class BitmapDocIdSet : public Lucene::DocIdSet {
 public:
    explicit BitmapDocIdSet(const RoaringBitmap64* ids) : DocIdSet(), bitmap_(ids) {}

    /// Always reports the set as cacheable.
    bool isCacheable() override {
        return true;
    }

    /// Creates a new iterator over the borrowed bitmap.
    Lucene::DocIdSetIteratorPtr iterator() override {
        Lucene::DocIdSetIteratorPtr it = Lucene::newLucene<BitmapDocIdSetIterator>(bitmap_);
        return it;
    }

 private:
    /// Borrowed bitmap of document ids.
    const RoaringBitmap64* bitmap_;
};

/// Lucene Filter backed by a borrowed RoaringBitmap64 of document ids.
///
/// Only the pointer is stored; the bitmap itself is not copied.
class LuceneFilter : public Lucene::Filter {
 public:
    explicit LuceneFilter(const RoaringBitmap64* ids) : bitmap_(ids) {}

    /// Returns a DocIdSet over the borrowed bitmap; the reader argument is not consulted.
    Lucene::DocIdSetPtr getDocIdSet(const Lucene::IndexReaderPtr& /*reader*/) override {
        Lucene::DocIdSetPtr doc_id_set = Lucene::newLucene<BitmapDocIdSet>(bitmap_);
        return doc_id_set;
    }

 private:
    /// Borrowed bitmap of document ids.
    const RoaringBitmap64* bitmap_;
};

} // namespace paimon::lucene
Loading
Loading