diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index b15132ad00b..cd10e27a085 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -2445,3 +2445,127 @@ fn inner_cleanup_with_policy<'local>( Ok(jstats) } + +////////////////////////////// +// Index operation Methods // +////////////////////////////// + +#[no_mangle] +pub extern "system" fn Java_org_lance_Dataset_nativeGetIndexes<'local>( + mut env: JNIEnv<'local>, + java_dataset: JObject, +) -> JObject<'local> { + ok_or_throw!(env, inner_get_indexes(&mut env, java_dataset)) +} + +fn inner_get_indexes<'local>( + env: &mut JNIEnv<'local>, + java_dataset: JObject, +) -> Result> { + let indexes = { + let dataset_guard = + unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; + dataset_guard.list_indexes()? + }; + + let array_list = env.new_object("java/util/ArrayList", "()V", &[])?; + + for index_meta in indexes.iter() { + let java_index = index_meta.into_java(env)?; + env.call_method( + &array_list, + "add", + "(Ljava/lang/Object;)Z", + &[JValue::Object(&java_index)], + )?; + } + + Ok(array_list) +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_Dataset_nativeCountIndexedRows( + mut env: JNIEnv, + java_dataset: JObject, + jindex_name: JString, + jfilter: JString, + jfragment_ids: JObject, // Optional> +) -> jlong { + ok_or_throw_with_return!( + env, + inner_count_indexed_rows(&mut env, java_dataset, jindex_name, jfilter, jfragment_ids), + -1 + ) +} + +fn inner_count_indexed_rows( + env: &mut JNIEnv, + java_dataset: JObject, + _jindex_name: JString, + jfilter: JString, + jfragment_ids: JObject, // Optional> +) -> Result { + let filter: String = jfilter.extract(env)?; + + // Extract optional fragment IDs + let fragment_ids: Option> = if env + .call_method(&jfragment_ids, "isPresent", "()Z", &[])? + .z()? + { + let list_obj = env + .call_method(&jfragment_ids, "get", "()Ljava/lang/Object;", &[])? + .l()?; + let list = env.get_list(&list_obj)?; + let mut ids = Vec::new(); + let mut iter = list.iter(env)?; + while let Some(elem) = iter.next(env)? { + let int_val = env.call_method(&elem, "intValue", "()I", &[])?.i()?; + ids.push(int_val as u32); + } + Some(ids) + } else { + None + }; + + let count = { + let dataset_guard = + unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; + + // Use a scanner with fragment filtering to count rows + // This ensures we only count rows in the specified fragments + let inner = dataset_guard.inner.clone(); + + RT.block_on(async { + let mut scanner = inner.scan(); + + // Apply filter + if !filter.is_empty() { + scanner.filter(&filter)?; + } + + // Empty projection and enable row_id for count_rows to work + // count_rows() requires metadata-only projection + scanner.project::(&[])?; + scanner.with_row_id(); + + // Apply fragment filter if specified + if let Some(frag_ids) = fragment_ids { + // Convert FileFragment to Fragment by extracting metadata + let filtered_fragments: Vec<_> = inner + .get_fragments() + .into_iter() + .filter(|f| frag_ids.contains(&(f.id() as u32))) + .map(|f| f.metadata().clone()) + .collect(); + scanner.with_fragments(filtered_fragments); + } + + // Use the scanner's count_rows method + let count = scanner.count_rows().await?; + + Ok::(count as i64) + })? + }; + + Ok(count) +} diff --git a/java/lance-jni/src/transaction.rs b/java/lance-jni/src/transaction.rs index 32ffe3c99e0..9b80a741d6f 100644 --- a/java/lance-jni/src/transaction.rs +++ b/java/lance-jni/src/transaction.rs @@ -150,10 +150,13 @@ impl IntoJava for &IndexMetadata { JObject::null() }; - // Create IndexMetadata object + // Determine index type from index_details type_url + let index_type = determine_index_type(env, &self.index_details)?; + + // Create Index object Ok(env.new_object( "org/lance/index/Index", - "(Ljava/util/UUID;Ljava/util/List;Ljava/lang/String;JLjava/util/List;[BILjava/time/Instant;Ljava/lang/Integer;)V", + "(Ljava/util/UUID;Ljava/util/List;Ljava/lang/String;JLjava/util/List;[BILjava/time/Instant;Ljava/lang/Integer;Lorg/lance/index/IndexType;)V", &[ JValue::Object(&uuid), JValue::Object(&fields), @@ -164,11 +167,77 @@ impl IntoJava for &IndexMetadata { JValue::Int(self.index_version), JValue::Object(&created_at), JValue::Object(&base_id), + JValue::Object(&index_type), ], )?) } } +/// Determine the IndexType enum value from index_details protobuf +fn determine_index_type<'local>( + env: &mut JNIEnv<'local>, + index_details: &Option>, +) -> Result> { + let type_name = if let Some(details) = index_details { + // Extract type name from type_url (e.g., ".lance.index.BTreeIndexDetails" -> "BTREE") + let type_url = &details.type_url; + let type_part = type_url.split('.').next_back().unwrap_or(""); + let lower = type_part.to_lowercase(); + + if lower.contains("btree") { + Some("BTREE") + } else if lower.contains("bitmap") { + Some("BITMAP") + } else if lower.contains("labellist") { + Some("LABEL_LIST") + } else if lower.contains("inverted") { + Some("INVERTED") + } else if lower.contains("ngram") { + Some("NGRAM") + } else if lower.contains("zonemap") { + Some("ZONEMAP") + } else if lower.contains("bloomfilter") { + Some("BLOOM_FILTER") + } else if lower.contains("ivfhnsw") { + if lower.contains("sq") { + Some("IVF_HNSW_SQ") + } else if lower.contains("pq") { + Some("IVF_HNSW_PQ") + } else { + Some("IVF_HNSW_FLAT") + } + } else if lower.contains("ivf") { + if lower.contains("sq") { + Some("IVF_SQ") + } else if lower.contains("pq") { + Some("IVF_PQ") + } else { + Some("IVF_FLAT") + } + } else if lower.contains("vector") { + Some("VECTOR") + } else { + None + } + } else { + None + }; + + match type_name { + Some(name) => { + let index_type = env + .get_static_field( + "org/lance/index/IndexType", + name, + "Lorg/lance/index/IndexType;", + )? + .l()?; + Ok(index_type) + } + None => Ok(JObject::null()), + } +} + impl IntoJava for &UpdateMode { fn into_java<'a>(self, env: &mut JNIEnv<'a>) -> Result> { let name = match self { diff --git a/java/src/main/java/org/lance/Dataset.java b/java/src/main/java/org/lance/Dataset.java index 21572214eda..107294bdf7d 100644 --- a/java/src/main/java/org/lance/Dataset.java +++ b/java/src/main/java/org/lance/Dataset.java @@ -16,6 +16,7 @@ import org.lance.cleanup.CleanupPolicy; import org.lance.cleanup.RemovalStats; import org.lance.compaction.CompactionOptions; +import org.lance.index.Index; import org.lance.index.IndexOptions; import org.lance.index.IndexParams; import org.lance.index.IndexType; @@ -834,6 +835,31 @@ public long countRows(String filter) { private native long nativeCountRows(Optional filter); + /** + * Count rows matching a filter using a specific scalar index. This directly queries the index and + * counts matching row addresses, which is more efficient than scanning when the index covers the + * filter column. + * + * @param indexName the name of the scalar index to use + * @param filter the filter expression (e.g., "column = 5") + * @param fragmentIds optional list of fragment IDs to restrict the count to + * @return count of matching rows + */ + public long countIndexedRows( + String indexName, String filter, Optional> fragmentIds) { + try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) { + Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed"); + Preconditions.checkArgument( + indexName != null && !indexName.isEmpty(), "indexName cannot be null or empty"); + Preconditions.checkArgument( + filter != null && !filter.isEmpty(), "filter cannot be null or empty"); + return nativeCountIndexedRows(indexName, filter, fragmentIds); + } + } + + private native long nativeCountIndexedRows( + String indexName, String filter, Optional> fragmentIds); + /** * Calculate the size of the dataset. * @@ -928,6 +954,20 @@ public List listIndexes() { private native List nativeListIndexes(); + /** + * Get all indexes with full metadata. + * + * @return list of Index objects with complete metadata including index type and fragment coverage + */ + public List getIndexes() { + try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) { + Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed"); + return nativeGetIndexes(); + } + } + + private native List nativeGetIndexes(); + /** * Get the table config of the dataset. * diff --git a/java/src/main/java/org/lance/index/Index.java b/java/src/main/java/org/lance/index/Index.java index 86ff8c6007b..955835496ed 100644 --- a/java/src/main/java/org/lance/index/Index.java +++ b/java/src/main/java/org/lance/index/Index.java @@ -36,6 +36,7 @@ public class Index { private final int indexVersion; private final Instant createdAt; private final Integer baseId; + private final IndexType indexType; private Index( UUID uuid, @@ -46,7 +47,8 @@ private Index( byte[] indexDetails, int indexVersion, Instant createdAt, - Integer baseId) { + Integer baseId, + IndexType indexType) { this.uuid = uuid; this.fields = fields; this.name = name; @@ -56,6 +58,7 @@ private Index( this.indexVersion = indexVersion; this.createdAt = createdAt; this.baseId = baseId; + this.indexType = indexType; } public UUID uuid() { @@ -119,6 +122,15 @@ public Optional createdAt() { return Optional.ofNullable(createdAt); } + /** + * Get the type of the index (e.g., BTREE, BITMAP, VECTOR). + * + * @return the index type, or null if unknown + */ + public IndexType indexType() { + return indexType; + } + @Override public boolean equals(Object o) { if (this == o) return true; @@ -132,14 +144,23 @@ public boolean equals(Object o) { && Objects.equals(fragments, index.fragments) && Arrays.equals(indexDetails, index.indexDetails) && Objects.equals(createdAt, index.createdAt) - && Objects.equals(baseId, index.baseId); + && Objects.equals(baseId, index.baseId) + && indexType == index.indexType; } @Override public int hashCode() { int result = Objects.hash( - uuid, fields, name, datasetVersion, indexVersion, createdAt, baseId, fragments); + uuid, + fields, + name, + datasetVersion, + indexVersion, + createdAt, + baseId, + fragments, + indexType); result = 31 * result + Arrays.hashCode(indexDetails); return result; } @@ -152,6 +173,7 @@ public String toString() { .add("name", name) .add("datasetVersion", datasetVersion) .add("indexVersion", indexVersion) + .add("indexType", indexType) .add("createdAt", createdAt) .add("baseId", baseId) .toString(); @@ -177,6 +199,7 @@ public static class Builder { private int indexVersion; private Instant createdAt; private Integer baseId; + private IndexType indexType; private Builder() {} @@ -225,6 +248,11 @@ public Builder baseId(Integer baseId) { return this; } + public Builder indexType(IndexType indexType) { + this.indexType = indexType; + return this; + } + public Index build() { return new Index( uuid, @@ -235,7 +263,8 @@ public Index build() { indexDetails, indexVersion, createdAt, - baseId); + baseId, + indexType); } } }