Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
a6d4a36
feat: support segmented inverted index build and search
Xuanwo Mar 26, 2026
1efcfa9
test: cover segmented inverted index workflows
Xuanwo Mar 26, 2026
b71d333
fix: address segmented inverted index review issues
Xuanwo Mar 26, 2026
f69eb66
fix: update inverted bench for shared bm25 scorer
Xuanwo Mar 26, 2026
ea80a5b
fix: satisfy clippy visibility checks
Xuanwo Mar 26, 2026
28e963b
fix: restore vector segment builder compatibility
Xuanwo Mar 26, 2026
f9a7c71
fix: require explicit index type for segment builder
Xuanwo Mar 26, 2026
6d242df
fix: require explicit segment types in java builder
Xuanwo Mar 26, 2026
5fd5794
fix: skip segment commit path for vector extensions
Xuanwo Mar 26, 2026
88dab0e
fix: decode inverted index details from payload
Xuanwo Mar 26, 2026
30fff36
Merge remote-tracking branch 'origin/main' into feat/fts-segment-pr1
Xuanwo Apr 1, 2026
f5e60ab
feat: benchmark and parallelize segmented fts search
Xuanwo Apr 1, 2026
84f2ee6
Merge remote-tracking branch 'origin/main' into feat/fts-segment-pr1
Xuanwo Apr 2, 2026
e85f6d6
perf: reduce segmented fts scorer setup overhead
Xuanwo Apr 2, 2026
d9eef68
Merge remote-tracking branch 'origin/main' into feat/fts-segment-pr1
Xuanwo Apr 14, 2026
2ec9771
bench: compare segmented and partitioned inverted search
Xuanwo Apr 14, 2026
83f53a0
fix: silence unused vector segment parameter
Xuanwo Apr 14, 2026
21a15f8
Merge remote-tracking branch 'origin/main' into feat/fts-segment-pr1
Xuanwo Apr 14, 2026
c3dfd98
fix: restore segment builder call sites after main merge
Xuanwo Apr 14, 2026
d525086
Merge remote-tracking branch 'origin/main' into feat/fts-segment-pr1
Xuanwo Apr 20, 2026
c78f29e
fix: restore binding support after tokenizer merge
Xuanwo Apr 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
194 changes: 139 additions & 55 deletions java/lance-jni/src/blocking_dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ use lance::dataset::{
ColumnAlteration, CommitBuilder, Dataset, NewColumnTransform, ProjectionRequest, ReadParams,
Version, WriteParams,
};
use lance::index::DatasetIndexExt;
use lance::index::{DatasetIndexExt, IndexSegment};
use lance::io::commit::namespace_manifest::LanceNamespaceExternalManifestStore;
use lance::io::{ObjectStore, ObjectStoreParams};
use lance::session::Session as LanceSession;
Expand Down Expand Up @@ -239,10 +239,6 @@ impl BlockingDataset {
Ok(version)
}

pub fn version_id(&self) -> u64 {
self.inner.version_id()
}

pub fn list_versions(&self) -> Result<Vec<Version>> {
let versions = RT.block_on(self.inner.versions())?;
Ok(versions)
Expand Down Expand Up @@ -1089,6 +1085,61 @@ fn inner_merge_index_metadata(
Ok(())
}

#[unsafe(no_mangle)]
pub extern "system" fn Java_org_lance_Dataset_nativeBuildIndexSegments<'local>(
mut env: JNIEnv<'local>,
java_dataset: JObject,
java_segments: JObject,
index_type: jint,
target_segment_bytes_jobj: JObject,
) -> JObject<'local> {
ok_or_throw!(
env,
inner_build_index_segments(
&mut env,
java_dataset,
java_segments,
index_type,
target_segment_bytes_jobj
)
)
}

fn inner_build_index_segments<'local>(
env: &mut JNIEnv<'local>,
java_dataset: JObject,
java_segments: JObject,
index_type: jint,
target_segment_bytes_jobj: JObject,
) -> Result<JObject<'local>> {
let segments = import_vec_to_rust(env, &java_segments, |env, obj| obj.extract_object(env))?;
let index_type = IndexType::try_from(index_type)?;
let target_segment_bytes = env
.get_long_opt(&target_segment_bytes_jobj)?
.map(|v| v as u64);
let template = segment_template(&segments)?;

let built_segments = {
let dataset_guard =
unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?;
let mut builder = dataset_guard
.inner
.create_index_segment_builder()
.with_index_type(index_type)
.with_segments(segments);
if let Some(target_segment_bytes) = target_segment_bytes {
builder = builder.with_target_segment_bytes(target_segment_bytes);
}
RT.block_on(builder.build_all())?
};

let built_metadata = built_segments
.into_iter()
.map(|segment| index_segment_to_metadata(&template, segment))
.collect::<Vec<_>>();
export_vec(env, &built_metadata)
}

#[unsafe(no_mangle)]
pub extern "system" fn Java_org_lance_Dataset_nativeMergeExistingIndexSegments<'local>(
mut env: JNIEnv<'local>,
Expand Down Expand Up @@ -1144,7 +1195,12 @@ fn inner_commit_existing_index_segments<'local>(
) -> Result<JObject<'local>> {
let index_name = index_name.extract(env)?;
let column = column.extract(env)?;
let segments = import_vec_to_rust(env, &java_segments, |env, obj| obj.extract_object(env))?;
let segment_metadata =
import_vec_to_rust(env, &java_segments, |env, obj| obj.extract_object(env))?;
let segments = segment_metadata
.iter()
.map(index_metadata_to_segment)
.collect::<Result<Vec<_>>>()?;

let committed = {
let mut dataset_guard =
Expand All @@ -1160,6 +1216,82 @@ fn inner_commit_existing_index_segments<'local>(
export_vec(env, &committed)
}

/// Identifying metadata shared by every segment in one build batch.
///
/// Extracted and validated by `segment_template` (all input segments must
/// agree on these values) and copied onto each built segment's metadata by
/// `index_segment_to_metadata`.
struct SegmentTemplate {
    // Logical index name shared by the batch.
    name: String,
    // Field ids the index covers.
    fields: Vec<i32>,
    // Dataset version the segments were built against.
    dataset_version: u64,
}

fn segment_template(segments: &[IndexMetadata]) -> Result<SegmentTemplate> {
let first = segments
.first()
.ok_or_else(|| Error::input_error("segments cannot be empty".to_string()))?;
for segment in &segments[1..] {
if segment.name != first.name {
return Err(Error::input_error(format!(
"All segments must share the same index name, got '{}' and '{}'",
first.name, segment.name
)));
}
if segment.fields != first.fields {
return Err(Error::input_error(format!(
"All segments must target the same field ids, got {:?} and {:?}",
first.fields, segment.fields
)));
}
if segment.dataset_version != first.dataset_version {
return Err(Error::input_error(format!(
"All segments must share the same dataset version, got {} and {}",
first.dataset_version, segment.dataset_version
)));
}
}

Ok(SegmentTemplate {
name: first.name.clone(),
fields: first.fields.clone(),
dataset_version: first.dataset_version,
})
}

fn index_metadata_to_segment(metadata: &IndexMetadata) -> Result<IndexSegment> {
let fragment_bitmap = metadata.fragment_bitmap.clone().ok_or_else(|| {
Error::input_error(format!(
"Segment '{}' is missing fragment coverage metadata",
metadata.uuid
))
})?;
let index_details = metadata.index_details.clone().ok_or_else(|| {
Error::input_error(format!(
"Segment '{}' is missing index details metadata",
metadata.uuid
))
})?;

Ok(IndexSegment::new(
metadata.uuid,
fragment_bitmap,
index_details,
metadata.index_version,
))
}

fn index_segment_to_metadata(template: &SegmentTemplate, segment: IndexSegment) -> IndexMetadata {
let (uuid, fragment_bitmap, index_details, index_version) = segment.into_parts();
IndexMetadata {
uuid,
fields: template.fields.clone(),
name: template.name.clone(),
dataset_version: template.dataset_version,
fragment_bitmap: Some(fragment_bitmap),
index_details: Some(index_details),
index_version,
created_at: Some(Utc::now()),
base_id: None,
files: None,
}
}

#[unsafe(no_mangle)]
pub extern "system" fn Java_org_lance_Dataset_nativeOptimizeIndices(
mut env: JNIEnv,
Expand Down Expand Up @@ -1541,20 +1673,6 @@ fn inner_get_version<'local>(
version.into_java(env)
}

#[unsafe(no_mangle)]
pub extern "system" fn Java_org_lance_Dataset_nativeGetVersionId(
mut env: JNIEnv,
java_dataset: JObject,
) -> jlong {
ok_or_throw_with_return!(env, inner_get_version_id(&mut env, java_dataset), -1) as jlong
}

fn inner_get_version_id(env: &mut JNIEnv, java_dataset: JObject) -> Result<u64> {
let dataset_guard =
unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?;
Ok(dataset_guard.version_id())
}

#[unsafe(no_mangle)]
pub extern "system" fn Java_org_lance_Dataset_nativeGetLatestVersionId(
mut env: JNIEnv,
Expand Down Expand Up @@ -1879,20 +1997,6 @@ fn inner_get_config<'local>(
Ok(java_hashmap)
}

#[unsafe(no_mangle)]
pub extern "system" fn Java_org_lance_Dataset_nativeHasStableRowIds(
mut env: JNIEnv,
java_dataset: JObject,
) -> jboolean {
ok_or_throw_with_return!(env, inner_has_stable_row_ids(&mut env, java_dataset), 0u8)
}

fn inner_has_stable_row_ids(env: &mut JNIEnv, java_dataset: JObject) -> Result<u8> {
let dataset_guard =
unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?;
Ok(dataset_guard.inner.manifest().uses_stable_row_ids() as u8)
}

#[unsafe(no_mangle)]
pub extern "system" fn Java_org_lance_Dataset_nativeGetLanceFileFormatVersion<'local>(
mut env: JNIEnv<'local>,
Expand Down Expand Up @@ -2445,32 +2549,12 @@ fn inner_list_branches<'local>(
} else {
JObject::null()
};
let jbranch_identifier = env.new_object(
"java/util/ArrayList",
"(I)V",
&[JValue::Int(contents.identifier.version_mapping.len() as i32)],
)?;
for (version, uuid) in contents.identifier.version_mapping.iter() {
let juuid = env.new_string(uuid)?;
let jmapping = env.new_object(
"org/lance/Branch$BranchVersionMapping",
"(JLjava/lang/String;)V",
&[JValue::Long(*version as i64), JValue::Object(&juuid)],
)?;
env.call_method(
&jbranch_identifier,
"add",
"(Ljava/lang/Object;)Z",
&[JValue::Object(&jmapping)],
)?;
}
let jbranch = env.new_object(
"org/lance/Branch",
"(Ljava/lang/String;Ljava/lang/String;Ljava/util/List;JJI)V",
"(Ljava/lang/String;Ljava/lang/String;JJI)V",
&[
JValue::Object(&jname),
JValue::Object(&jparent),
JValue::Object(&jbranch_identifier),
JValue::Long(contents.parent_version as i64),
JValue::Long(contents.create_at as i64),
JValue::Int(contents.manifest_size as i32),
Expand Down
68 changes: 40 additions & 28 deletions java/src/main/java/org/lance/Dataset.java
Original file line number Diff line number Diff line change
Expand Up @@ -803,14 +803,9 @@ public String uri() {
* @return the version id of the dataset
*/
public long version() {
try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) {
Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed");
return nativeGetVersionId();
}
return getVersion().getId();
}

private native long nativeGetVersionId();

/**
* Gets the currently checked out version of the dataset.
*
Expand Down Expand Up @@ -1037,6 +1032,44 @@ public void mergeIndexMetadata(
private native void innerMergeIndexMetadata(
String indexUUID, int indexType, Optional<Integer> batchReadHead);

/**
 * Build physical vector index segments from fragment-level index outputs that were previously
 * staged by {@link #createIndex(IndexOptions)} with fragmentIds provided.
 *
 * @param segments staged segment metadata; must be non-null and non-empty
 * @param indexType concrete index type of the staged segments; must be non-null
 * @param targetSegmentBytes optional size target for merged physical segments
 * @return metadata describing the built physical segments
 */
public List<Index> buildIndexSegments(
    List<Index> segments, IndexType indexType, Optional<Long> targetSegmentBytes) {
  Preconditions.checkNotNull(segments, "segments cannot be null");
  Preconditions.checkArgument(!segments.isEmpty(), "segments cannot be empty");
  Preconditions.checkNotNull(indexType, "indexType cannot be null");
  try (LockManager.WriteLock lock = lockManager.acquireWriteLock()) {
    Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed");
    int nativeIndexType = indexType.getValue();
    return nativeBuildIndexSegments(segments, nativeIndexType, targetSegmentBytes);
  }
}

/**
 * Deprecated overload retained for source compatibility; it never builds anything and always
 * throws.
 *
 * @deprecated an explicit {@link IndexType} is now required; call
 *     {@link #buildIndexSegments(List, IndexType, Optional)} instead
 * @param segments ignored
 * @param targetSegmentBytes ignored
 * @throws IllegalArgumentException always
 */
@Deprecated
public List<Index> buildIndexSegments(List<Index> segments, Optional<Long> targetSegmentBytes) {
  throw new IllegalArgumentException(
      "buildIndexSegments now requires an explicit index type; call "
          + "buildIndexSegments(segments, indexType, targetSegmentBytes)");
}

private native List<Index> nativeBuildIndexSegments(
List<Index> segments, int indexType, Optional<Long> targetSegmentBytes);

/** Merge one caller-defined group of existing uncommitted vector index segments. */
public Index mergeExistingIndexSegments(List<Index> segments) {
Preconditions.checkNotNull(segments, "segments cannot be null");
Expand Down Expand Up @@ -1262,11 +1295,7 @@ public List<String> listIndexes() {
/**
* Get all indexes with full metadata.
*
* <p>Each returned {@link Index} is a physical index segment from the manifest. Use {@link
* #describeIndices()} for the logical-index view.
*
* @return list of Index objects with complete segment metadata, including index type and fragment
* coverage
* @return list of Index objects with complete metadata including index type and fragment coverage
*/
public List<Index> getIndexes() {
try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) {
Expand Down Expand Up @@ -1362,23 +1391,6 @@ public Map<String, String> getConfig() {

private native Map<String, String> nativeGetConfig();

/**
* Check whether the dataset uses stable row IDs.
*
* <p>Stable row IDs remain constant when rows are moved during compaction. This reads the
* manifest feature flag directly rather than the user-facing config map.
*
* @return true if the dataset was created with stable row IDs enabled
*/
public boolean hasStableRowIds() {
try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) {
Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed");
return nativeHasStableRowIds();
}
}

private native boolean nativeHasStableRowIds();

/**
* Get the Lance file format version of this dataset.
*
Expand Down
Loading
Loading