Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ option(PAIMON_ENABLE_ORC "Whether to enable orc file format" ON)
option(PAIMON_ENABLE_LANCE "Whether to enable lance file format" OFF)
option(PAIMON_ENABLE_JINDO "Whether to enable jindo file system" OFF)
option(PAIMON_ENABLE_LUMINA "Whether to enable lumina vector index" ON)
option(PAIMON_ENABLE_LUCENE "Whether to enable lucene index" ON)

if(PAIMON_ENABLE_ORC)
add_definitions(-DPAIMON_ENABLE_ORC)
Expand Down Expand Up @@ -82,6 +83,10 @@ if(PAIMON_ENABLE_LUMINA)
add_definitions(-DPAIMON_ENABLE_LUMINA)
endif()

# When the lucene index is enabled, define the PAIMON_ENABLE_LUCENE preprocessor macro
# (directory-scoped via add_definitions) so sources can compile lucene-specific code conditionally.
if(PAIMON_ENABLE_LUCENE)
add_definitions(-DPAIMON_ENABLE_LUCENE)
endif()

add_definitions(-DSNAPPY_CODEC_AVAILABLE)
add_definitions(-DZSTD_CODEC_AVAILABLE)
add_definitions(-DRAPIDJSON_HAS_STDSTRING)
Expand Down Expand Up @@ -379,6 +384,11 @@ if(PAIMON_BUILD_TESTS)
list(APPEND TEST_STATIC_LINK_LIBS paimon_lumina_index_shared)
list(APPEND TEST_STATIC_LINK_LIBS "-Wl,--as-needed")
endif()
if(PAIMON_ENABLE_LUCENE)
    # Bracket the lucene index library with --no-as-needed / --as-needed so the linker keeps
    # it as a dependency of the test binaries even when no symbol is referenced directly;
    # the default --as-needed behaviour is restored for whatever is appended afterwards.
    list(APPEND
         TEST_STATIC_LINK_LIBS
         "-Wl,--no-as-needed"
         paimon_lucene_index_shared
         "-Wl,--as-needed")
endif()

endif()

Expand Down Expand Up @@ -407,6 +417,7 @@ add_subdirectory(src/paimon/format/parquet)
add_subdirectory(src/paimon/format/avro)
add_subdirectory(src/paimon/format/lance)
add_subdirectory(src/paimon/global_index/lumina)
add_subdirectory(src/paimon/global_index/lucene)
add_subdirectory(src/paimon/testing/mock)
add_subdirectory(src/paimon/testing/utils)
add_subdirectory(test/inte)
5 changes: 5 additions & 0 deletions cmake_modules/SetupCxxFlags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,13 @@ if("${BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-unused-parameter")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-unknown-warning-option")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-constant-logical-operand")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-deprecated-declarations")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-deprecated-builtins")
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wall")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-conversion")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-deprecated-declarations")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-deprecated-builtins")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-sign-conversion")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-unused-variable")
else()
Expand Down Expand Up @@ -182,6 +185,8 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STRE

# Don't complain about optimization passes that were not possible
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-pass-failed")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-deprecated-declarations")
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -Wno-deprecated-builtins")

if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
# Depending on the default OSX_DEPLOYMENT_TARGET (< 10.9), libstdc++ may be
Expand Down
165 changes: 165 additions & 0 deletions cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,18 @@ else()
endif()
endif()

# Resolve where the Lucene++ source archive comes from, in priority order:
#   1. the PAIMON_LUCENE_URL environment variable (read at configure time),
#   2. a package named PAIMON_LUCENE_PKG_NAME already present in THIRDPARTY_DIR,
#   3. the upstream GitHub tag archive, prefixed with THIRDPARTY_MIRROR_URL.
if(DEFINED ENV{PAIMON_LUCENE_URL})
    set(LUCENE_SOURCE_URL "$ENV{PAIMON_LUCENE_URL}")
elseif(EXISTS "${THIRDPARTY_DIR}/${PAIMON_LUCENE_PKG_NAME}")
    set_urls(LUCENE_SOURCE_URL "${THIRDPARTY_DIR}/${PAIMON_LUCENE_PKG_NAME}")
else()
    set_urls(LUCENE_SOURCE_URL
             "${THIRDPARTY_MIRROR_URL}https://github.com/luceneplusplus/LucenePlusPlus/archive/refs/tags/${PAIMON_LUCENE_PKG_NAME}"
    )
endif()

if(DEFINED ENV{PAIMON_GLOG_URL})
set(GLOG_SOURCE_URL "$ENV{PAIMON_GLOG_URL}")
else()
Expand Down Expand Up @@ -275,6 +287,62 @@ set(EP_COMMON_CMAKE_ARGS
-DCMAKE_C_FLAGS=${EP_C_FLAGS}
-DCMAKE_INSTALL_LIBDIR=lib)

# Builds Lucene++ from source with ExternalProject (target lucene_ep) and wraps the result in
# the INTERFACE IMPORTED target `lucene`.
#
# Reads: EP_COMMON_CMAKE_ARGS, EP_COMMON_OPTIONS, LUCENE_SOURCE_URL,
#        PAIMON_LUCENE_BUILD_SHA256_CHECKSUM, BOOST_INCLUDE_DIR, BOOST_LIBRARY_DIR,
#        BOOST_INSTALL, BOOST_EXTRA_INCLUDE_DIR.
# Sets (in the caller's scope, because this is a macro): LUCENE_PREFIX, LUCENE_CMAKE_ARGS,
#        LUCENE_LIB, LUCENE_INCLUDE_DIR.
# The boost_* targets named under DEPENDS must already be defined when this macro runs.
macro(build_lucene)
message(STATUS "Building lucene from source")
set(LUCENE_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/lucene_ep-install")
# Arguments forwarded to the external Lucene++ CMake configure step.
# NOTE(review): EP_COMMON_CMAKE_ARGS already carries -DCMAKE_C_FLAGS=${EP_C_FLAGS}; the
# -DCMAKE_C_FLAGS / -DCMAKE_CXX_FLAGS entries below come later on the command line and
# presumably replace those values with just -pthread -- confirm this is intended.
# NOTE(review): Boost_CHRONO_FOUND / Boost_THREAD_FOUND are preset to TRUE, presumably to
# satisfy the Boost detection inside the Lucene++ build -- confirm against its CMakeLists.
set(LUCENE_CMAKE_ARGS
${EP_COMMON_CMAKE_ARGS}
"-DENABLE_TEST=OFF"
"-DCMAKE_C_FLAGS=-pthread"
"-DCMAKE_CXX_FLAGS=-pthread"
"-DCMAKE_EXE_LINKER_FLAGS=-pthread"
"-DBoost_INCLUDE_DIR=${BOOST_INCLUDE_DIR}"
"-DBoost_LIBRARY_DIR=${BOOST_LIBRARY_DIR}"
"-DBOOST_ROOT=${BOOST_INSTALL}"
"-DBoost_CHRONO_FOUND=TRUE"
"-DBoost_THREAD_FOUND=TRUE"
"-DCMAKE_INSTALL_PREFIX=${LUCENE_PREFIX}")

# Hard-coded path of the shared library (liblucene++.so.0) expected under the install prefix.
# It is declared as a build byproduct and linked through the `lucene` target below.
set(LUCENE_LIB "${LUCENE_PREFIX}/lib/liblucene++.so.0")
# Download (checksum-verified), configure, build and install Lucene++ after the Boost
# component targets listed under DEPENDS.
externalproject_add(lucene_ep
${EP_COMMON_OPTIONS}
URL ${LUCENE_SOURCE_URL}
URL_HASH "SHA256=${PAIMON_LUCENE_BUILD_SHA256_CHECKSUM}"
CMAKE_ARGS ${LUCENE_CMAKE_ARGS}
BUILD_BYPRODUCTS ${LUCENE_LIB}
DEPENDS boost_date_time
boost_filesystem
boost_regex
boost_thread
boost_iostreams
boost_system
boost_chrono
boost_atomic)

set(LUCENE_INCLUDE_DIR "${LUCENE_PREFIX}/include")
# The include directory must exist before it is referenced by a target.
file(MAKE_DIRECTORY "${LUCENE_INCLUDE_DIR}")
# Directory-scoped system include paths for the Lucene++ and Boost headers.
# NOTE(review): BOOST_EXTRA_INCLUDE_DIR is not set in the visible part of this file; if it
# is undefined it expands to nothing here -- confirm where it is expected to come from.
include_directories(SYSTEM ${LUCENE_INCLUDE_DIR} ${BOOST_INCLUDE_DIR}
${BOOST_EXTRA_INCLUDE_DIR})
# Consumer-facing target: carries the include directory, the -pthread compile option and
# the link line (the Lucene++ library, the Boost component targets, pthread and dl).
add_library(lucene INTERFACE IMPORTED)
target_include_directories(lucene SYSTEM INTERFACE "${LUCENE_INCLUDE_DIR}")
target_compile_options(lucene INTERFACE -pthread)

target_link_libraries(lucene
INTERFACE "${LUCENE_LIB}"
boost_date_time
boost_filesystem
boost_regex
boost_thread
boost_iostreams
boost_system
boost_chrono
boost_atomic
pthread
dl)
# Make the imported target depend on the external build, since LUCENE_LIB does not exist
# until lucene_ep has been built and installed.
add_dependencies(lucene lucene_ep)
endmacro()

macro(build_rapidjson)
message(STATUS "Building RapidJSON from source")
set(RAPIDJSON_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/rapidjson_ep-install")
Expand Down Expand Up @@ -342,6 +410,99 @@ macro(build_fmt)
add_dependencies(fmt fmt_ep)
endmacro(build_fmt)

# Builds the Boost components that the lucene build depends on from source.
#
# Clones the Boost super-project (with submodules) at tag boost-${PAIMON_BOOST_BUILD_VERSION},
# bootstraps b2 for the components in BOOST_EP_COMPONENTS, builds them as static release
# libraries compiled with -fPIC, and defines one STATIC IMPORTED target per component named
# boost_<component> (boost_date_time, boost_filesystem, ...), each depending on boost_ep.
#
# BOOST_EP_COMPONENTS is the single source of truth: the b2 --with-libraries argument, the
# declared build byproducts and the imported targets are all derived from it, so adding a
# component only requires touching that one list.
#
# Sets (in the caller's scope, because this is a macro): BOOST_PREFIX, BOOST_INSTALL,
# BOOST_INCLUDE_DIR, BOOST_LIBRARY_DIR, BOOST_BYPRODUCTS, BOOST_EP_COMPONENTS,
# BOOST_EP_WITH_LIBRARIES.
macro(build_boost)
    message(STATUS "Building boost from source")
    # BOOST_PREFIX must match the prefix ExternalProject uses for boost_ep, because the
    # configure/build/install commands below address the checkout as
    # ${BOOST_PREFIX}/src/boost_ep.
    set(BOOST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/boost_ep-prefix")
    set(BOOST_INSTALL "${CMAKE_CURRENT_BINARY_DIR}/boost_ep-install")
    set(BOOST_INCLUDE_DIR "${BOOST_INSTALL}/include")
    set(BOOST_LIBRARY_DIR "${BOOST_INSTALL}/lib")
    # Create the directories up front: they are referenced by imported targets before the
    # external build has produced anything.
    file(MAKE_DIRECTORY "${BOOST_INCLUDE_DIR}")
    file(MAKE_DIRECTORY "${BOOST_LIBRARY_DIR}")

    set(BOOST_EP_COMPONENTS
        date_time
        filesystem
        iostreams
        regex
        system
        thread
        chrono
        atomic)
    # bootstrap.sh expects a comma-separated list.
    string(REPLACE ";" "," BOOST_EP_WITH_LIBRARIES "${BOOST_EP_COMPONENTS}")

    # One static archive per component; declared so the generator knows boost_ep creates them.
    set(BOOST_BYPRODUCTS)
    foreach(BOOST_EP_COMPONENT IN LISTS BOOST_EP_COMPONENTS)
        list(APPEND BOOST_BYPRODUCTS
             "${BOOST_LIBRARY_DIR}/libboost_${BOOST_EP_COMPONENT}.a")
    endforeach()

    externalproject_add(boost_ep
                        GIT_REPOSITORY https://github.com/boostorg/boost.git
                        GIT_TAG boost-${PAIMON_BOOST_BUILD_VERSION}
                        GIT_SHALLOW FALSE
                        GIT_PROGRESS TRUE
                        GIT_SUBMODULES_RECURSE TRUE
                        CONFIGURE_COMMAND ${BOOST_PREFIX}/src/boost_ep/bootstrap.sh
                                          --with-libraries=${BOOST_EP_WITH_LIBRARIES}
                        BUILD_IN_SOURCE TRUE
                        BUILD_COMMAND ${BOOST_PREFIX}/src/boost_ep/b2
                                      --prefix=${BOOST_INSTALL}
                                      --libdir=${BOOST_LIBRARY_DIR} link=static
                                      runtime-link=shared threading=multi variant=release
                                      cxxflags=-fPIC install
                        # Copy the public headers of every Boost library (one and two
                        # directory levels below libs/) into the install include tree.
                        INSTALL_COMMAND bash -c
                                        "mkdir -p ${BOOST_INSTALL}/include/boost && cp -r ${BOOST_PREFIX}/src/boost_ep/libs/*/include/boost/* ${BOOST_INSTALL}/include/boost && cp -r ${BOOST_PREFIX}/src/boost_ep/libs/*/*/include/boost/* ${BOOST_INSTALL}/include/boost"
                        BUILD_BYPRODUCTS ${BOOST_BYPRODUCTS}
                        LOG_DOWNLOAD ON
                        LOG_CONFIGURE ON
                        LOG_BUILD ON)

    include_directories(SYSTEM "${BOOST_INCLUDE_DIR}")

    # Expose each component as boost_<component>; the dependency on boost_ep makes consumers
    # wait for the external build that actually produces the archive.
    foreach(BOOST_EP_COMPONENT IN LISTS BOOST_EP_COMPONENTS)
        add_library(boost_${BOOST_EP_COMPONENT} STATIC IMPORTED)
        set_target_properties(boost_${BOOST_EP_COMPONENT}
                              PROPERTIES IMPORTED_LOCATION
                                         "${BOOST_LIBRARY_DIR}/libboost_${BOOST_EP_COMPONENT}.a"
                                         INTERFACE_INCLUDE_DIRECTORIES
                                         "${BOOST_INCLUDE_DIR}")
        add_dependencies(boost_${BOOST_EP_COMPONENT} boost_ep)
    endforeach()
endmacro(build_boost)

macro(build_snappy)
message(STATUS "Building snappy from source")
set(SNAPPY_HOME "${CMAKE_CURRENT_BINARY_DIR}/snappy_ep-install")
Expand Down Expand Up @@ -1108,3 +1269,7 @@ if(PAIMON_ENABLE_JINDO)
build_jindosdk_c()
build_jindosdk_nextarch()
endif()
# Order matters: build_lucene() reads BOOST_INCLUDE_DIR, BOOST_LIBRARY_DIR and BOOST_INSTALL
# and depends on the boost_* targets, all of which are created by build_boost().
if(PAIMON_ENABLE_LUCENE)
build_boost()
build_lucene()
endif()
74 changes: 74 additions & 0 deletions include/paimon/predicate/full_text_search.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Copyright 2026-present Alibaba Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once
#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <vector>

#include "paimon/predicate/predicate.h"
#include "paimon/visibility.h"

namespace paimon {
/// A configuration structure for full-text search operations.
///
/// Plain value type: the constructor copies its arguments into the public members below
/// and performs no validation.
struct PAIMON_EXPORT FullTextSearch {
    /// Enumeration of supported full-text search types.
    enum class SearchType {
        /// All terms in the query must be present (AND semantics).
        MATCH_ALL = 1,
        /// Any term in the query can match (OR semantics).
        MATCH_ANY = 2,
        /// Matches the exact sequence of words (with proximity).
        PHRASE = 3,
        /// Matches terms starting with the given string (e.g., "run" → running, runner).
        PREFIX = 4,
        /// Supports wildcards * and ? (e.g., "ap*e", "app?e" -> "apple").
        WILDCARD = 5,
        /// Default/fallback type for unrecognized or invalid queries.
        UNKNOWN = 128
    };

    /// Creates a search request.
    ///
    /// @param _field_name Name of the field to search within.
    /// @param _limit Maximum number of documents to return.
    /// @param _query Query string; its interpretation depends on `_search_type`.
    /// @param _search_type Type of search to perform.
    FullTextSearch(const std::string& _field_name, int32_t _limit, const std::string& _query,
                   const SearchType& _search_type)
        : field_name(_field_name), limit(_limit), query(_query), search_type(_search_type) {}

    /// Name of the field to search within (must be a full-text indexed field).
    std::string field_name;
    /// Maximum number of documents to return. Results are ordered by score.
    int32_t limit;
    /// The query string to search for. The interpretation depends on search_type:
    ///
    /// - For MATCH_ALL/MATCH_ANY: keywords are split into terms using the **same analyzer as
    ///   indexing**.
    ///   Example: "Hello World" → terms ["hello", "world"] (after lowercasing and
    ///   tokenization).
    ///
    /// - For PHRASE: matches the exact word sequence (with optional slop). The query is
    ///   also analyzed.
    ///
    /// - For PREFIX: matches terms starting with the given string (e.g., "run" → running,
    ///   runner). Only the prefix part is considered; analysis will not be applied.
    ///
    /// - For WILDCARD: supports wildcards * and ? (e.g., "ap*e", "app?e").
    ///   Not passed through analyzer — matched directly against indexed terms.
    ///
    /// @note Analyzer consistency between indexing and querying is critical for correctness.
    std::string query;
    /// Type of search to perform.
    SearchType search_type;
};
} // namespace paimon
2 changes: 1 addition & 1 deletion src/paimon/common/io/data_output_stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class OutputStream;

// data output stream, support WriteValue() and WriteString() from OutputStream, also do big-endian
// conversion to ensure cross-language compatibility
class DataOutputStream {
class PAIMON_EXPORT DataOutputStream {
public:
explicit DataOutputStream(const std::shared_ptr<OutputStream>& output_stream);

Expand Down
13 changes: 13 additions & 0 deletions src/paimon/common/utils/options_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,19 @@ class OptionsUtils {
return value.value();
}

/// Collects every entry of `options` whose key begins with `prefix` and returns them in a
/// new map keyed by the remainder of the original key (the prefix is stripped). Entries
/// whose key does not start with `prefix` are left out.
static std::map<std::string, std::string> FetchOptionsWithPrefix(
    const std::string& prefix, const std::map<std::string, std::string>& options) {
    std::map<std::string, std::string> stripped;
    const int64_t skip = prefix.size();
    for (const auto& entry : options) {
        const std::string& name = entry.first;
        if (!StringUtils::StartsWith(name, prefix)) {
            continue;
        }
        // Keep only the part of the key that follows the prefix.
        stripped[name.substr(skip)] = entry.second;
    }
    return stripped;
}

private:
template <typename T>
static std::string GetTypeName() {
Expand Down
7 changes: 7 additions & 0 deletions src/paimon/common/utils/options_utils_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,11 @@ TEST(OptionsUtilsTest, TestGetValueFromMap) {
OptionsUtils::GetValueFromMap<int32_t>(key_value_map, "", 999));
ASSERT_EQ(999, empty);
}

TEST(OptionsUtilsTest, TestFetchOptionsWithPrefix) {
    // Only "test.key2" carries the "test." prefix; it must come back with the prefix
    // removed, while "key1" is dropped.
    const std::map<std::string, std::string> input = {{"key1", "value1"},
                                                      {"test.key2", "value2"}};
    const std::map<std::string, std::string> want = {{"key2", "value2"}};
    const auto got = OptionsUtils::FetchOptionsWithPrefix("test.", input);
    ASSERT_EQ(want, got);
}
} // namespace paimon::test
Loading
Loading