From f747fd5c3c90dc0465bda25324be5713dcf3724e Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 20 Nov 2025 19:51:26 -0800 Subject: [PATCH 01/12] feat(java): support credential vending at write time --- java/lance-jni/src/blocking_dataset.rs | 83 ++- java/lance-jni/src/fragment.rs | 88 +-- java/lance-jni/src/merge_insert.rs | 7 +- java/lance-jni/src/transaction.rs | 20 +- java/lance-jni/src/utils.rs | 31 + .../main/java/com/lancedb/lance/Dataset.java | 36 ++ .../main/java/com/lancedb/lance/Fragment.java | 51 +- .../lancedb/lance/WriteDatasetBuilder.java | 348 +++++++++++ .../lance/NamespaceIntegrationTest.java | 232 +++++++ .../StorageOptionsProviderWriteTest.java | 572 ++++++++++++++++++ 10 files changed, 1403 insertions(+), 65 deletions(-) create mode 100644 java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java create mode 100644 java/src/test/java/com/lancedb/lance/StorageOptionsProviderWriteTest.java diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index 7aeb0ba72f9..b44ef7b41a3 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -56,9 +56,15 @@ pub const NATIVE_DATASET: &str = "nativeDatasetHandle"; #[derive(Clone)] pub struct BlockingDataset { pub(crate) inner: Dataset, + pub(crate) storage_options_provider: Option>, } impl BlockingDataset { + /// Get the storage options provider that was used when opening this dataset + pub fn get_storage_options_provider(&self) -> Option> { + self.storage_options_provider.clone() + } + pub fn drop(uri: &str, storage_options: HashMap) -> Result<()> { RT.block_on(async move { let registry = Arc::new(ObjectStoreRegistry::default()); @@ -81,8 +87,15 @@ impl BlockingDataset { uri: &str, params: Option, ) -> Result { + let storage_options_provider = params + .as_ref() + .and_then(|p| p.store_params.as_ref()) + .and_then(|sp| sp.storage_options_provider.clone()); let inner = RT.block_on(Dataset::write(reader, uri, params))?; - 
Ok(Self { inner }) + Ok(Self { + inner, + storage_options_provider, + }) } #[allow(clippy::too_many_arguments)] @@ -122,7 +135,7 @@ impl BlockingDataset { builder = builder.with_version(ver as u64); } builder = builder.with_storage_options(storage_options); - if let Some(provider) = storage_options_provider { + if let Some(provider) = storage_options_provider.clone() { builder = builder.with_storage_options_provider(provider) } if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { @@ -135,7 +148,10 @@ impl BlockingDataset { } let inner = RT.block_on(builder.load())?; - Ok(Self { inner }) + Ok(Self { + inner, + storage_options_provider, + }) } pub fn commit( @@ -156,7 +172,10 @@ impl BlockingDataset { Default::default(), false, // TODO: support enable_v2_manifest_paths ))?; - Ok(Self { inner }) + Ok(Self { + inner, + storage_options_provider: None, + }) } pub fn latest_version(&self) -> Result { @@ -175,12 +194,18 @@ impl BlockingDataset { pub fn checkout_version(&mut self, version: u64) -> Result { let inner = RT.block_on(self.inner.checkout_version(version))?; - Ok(Self { inner }) + Ok(Self { + inner, + storage_options_provider: self.storage_options_provider.clone(), + }) } pub fn checkout_tag(&mut self, tag: &str) -> Result { let inner = RT.block_on(self.inner.checkout_version(tag))?; - Ok(Self { inner }) + Ok(Self { + inner, + storage_options_provider: self.storage_options_provider.clone(), + }) } pub fn checkout_latest(&mut self) -> Result<()> { @@ -214,7 +239,10 @@ impl BlockingDataset { None => Ref::from(version), }; let inner = RT.block_on(self.inner.create_branch(branch, reference, None))?; - Ok(Self { inner }) + Ok(Self { + inner, + storage_options_provider: self.storage_options_provider.clone(), + }) } pub fn delete_branch(&mut self, branch: &str) -> Result<()> { @@ -234,7 +262,10 @@ impl BlockingDataset { Ref::Version(branch, version) }; let inner = RT.block_on(self.inner.checkout_version(reference))?; - Ok(Self { inner }) + Ok(Self { + 
inner, + storage_options_provider: self.storage_options_provider.clone(), + }) } pub fn create_tag( @@ -284,17 +315,18 @@ impl BlockingDataset { pub fn commit_transaction( &mut self, transaction: Transaction, - write_params: HashMap, + store_params: ObjectStoreParams, ) -> Result { + let storage_options_provider = store_params.storage_options_provider.clone(); let new_dataset = RT.block_on( CommitBuilder::new(Arc::new(self.clone().inner)) - .with_store_params(ObjectStoreParams { - storage_options: Some(write_params), - ..Default::default() - }) + .with_store_params(store_params) .execute(transaction), )?; - Ok(BlockingDataset { inner: new_dataset }) + Ok(BlockingDataset { + inner: new_dataset, + storage_options_provider, + }) } pub fn read_transaction(&self) -> Result> { @@ -473,6 +505,8 @@ fn create_dataset<'local>( ) -> Result> { let path_str = path.extract(env)?; + // Dataset.create() doesn't support storage_options_provider yet, pass empty Optional + let empty_provider = JObject::null(); let write_params = extract_write_params( env, &max_rows_per_file, @@ -482,6 +516,7 @@ fn create_dataset<'local>( &enable_stable_row_ids, &data_storage_version, &storage_options_obj, + &empty_provider, )?; let dataset = BlockingDataset::write(reader, &path_str, Some(write_params))?; @@ -1305,10 +1340,11 @@ fn inner_shallow_clone<'local>( } }; - let new_ds = { + let (new_ds, provider) = { let mut dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; - RT.block_on( + let provider = dataset_guard.storage_options_provider.clone(); + let new_ds = RT.block_on( dataset_guard.inner.shallow_clone( &target_path_str, reference, @@ -1321,10 +1357,15 @@ fn inner_shallow_clone<'local>( }) .unwrap_or(None), ), - )? 
+ )?; + (new_ds, provider) }; - BlockingDataset { inner: new_ds }.into_java(env) + BlockingDataset { + inner: new_ds, + storage_options_provider: provider, + } + .into_java(env) } #[no_mangle] @@ -2112,12 +2153,16 @@ fn inner_create_branch_on_tag<'local>( let new_blocking_dataset = { let mut dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; + let provider = dataset_guard.storage_options_provider.clone(); let inner = RT.block_on(dataset_guard.inner.create_branch( branch_name.as_str(), reference, None, ))?; - BlockingDataset { inner } + BlockingDataset { + inner, + storage_options_provider: provider, + } }; new_blocking_dataset.into_java(env) } diff --git a/java/lance-jni/src/fragment.rs b/java/lance-jni/src/fragment.rs index 98be13a3e44..731d4631e6c 100644 --- a/java/lance-jni/src/fragment.rs +++ b/java/lance-jni/src/fragment.rs @@ -20,12 +20,13 @@ use lance::dataset::fragment::FileFragment; use lance_datafusion::utils::StreamingWriteSource; use crate::error::{Error, Result}; +use crate::ffi::JNIEnvExt; use crate::traits::{export_vec, import_vec, FromJObjectWithEnv, IntoJava, JLance}; use crate::{ blocking_dataset::{BlockingDataset, NATIVE_DATASET}, traits::FromJString, utils::extract_write_params, - JNIEnvExt, RT, + RT, }; #[derive(Debug, Clone)] @@ -82,13 +83,14 @@ pub extern "system" fn Java_com_lancedb_lance_Fragment_createWithFfiArray<'local dataset_uri: JString, arrow_array_addr: jlong, arrow_schema_addr: jlong, - max_rows_per_file: JObject, // Optional - max_rows_per_group: JObject, // Optional - max_bytes_per_file: JObject, // Optional - mode: JObject, // Optional - enable_stable_row_ids: JObject, // Optional - data_storage_version: JObject, // Optional - storage_options_obj: JObject, // Map + max_rows_per_file: JObject, // Optional + max_rows_per_group: JObject, // Optional + max_bytes_per_file: JObject, // Optional + mode: JObject, // Optional + enable_stable_row_ids: JObject, // Optional + 
data_storage_version: JObject, // Optional + storage_options_obj: JObject, // Map + storage_options_provider_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw_with_return!( env, @@ -103,7 +105,8 @@ pub extern "system" fn Java_com_lancedb_lance_Fragment_createWithFfiArray<'local mode, enable_stable_row_ids, data_storage_version, - storage_options_obj + storage_options_obj, + storage_options_provider_obj ), JObject::default() ) @@ -115,13 +118,14 @@ fn inner_create_with_ffi_array<'local>( dataset_uri: JString, arrow_array_addr: jlong, arrow_schema_addr: jlong, - max_rows_per_file: JObject, // Optional - max_rows_per_group: JObject, // Optional - max_bytes_per_file: JObject, // Optional - mode: JObject, // Optional - enable_stable_row_ids: JObject, // Optional - data_storage_version: JObject, // Optional - storage_options_obj: JObject, // Map + max_rows_per_file: JObject, // Optional + max_rows_per_group: JObject, // Optional + max_bytes_per_file: JObject, // Optional + mode: JObject, // Optional + enable_stable_row_ids: JObject, // Optional + data_storage_version: JObject, // Optional + storage_options_obj: JObject, // Map + storage_options_provider_obj: JObject, // Optional ) -> Result> { let c_array_ptr = arrow_array_addr as *mut FFI_ArrowArray; let c_schema_ptr = arrow_schema_addr as *mut FFI_ArrowSchema; @@ -146,6 +150,7 @@ fn inner_create_with_ffi_array<'local>( enable_stable_row_ids, data_storage_version, storage_options_obj, + storage_options_provider_obj, reader, ) } @@ -156,13 +161,14 @@ pub extern "system" fn Java_com_lancedb_lance_Fragment_createWithFfiStream<'a>( _obj: JObject, dataset_uri: JString, arrow_array_stream_addr: jlong, - max_rows_per_file: JObject, // Optional - max_rows_per_group: JObject, // Optional - max_bytes_per_file: JObject, // Optional - mode: JObject, // Optional - enable_stable_row_ids: JObject, // Optional - data_storage_version: JObject, // Optional - storage_options_obj: JObject, // Map + max_rows_per_file: JObject, // 
Optional + max_rows_per_group: JObject, // Optional + max_bytes_per_file: JObject, // Optional + mode: JObject, // Optional + enable_stable_row_ids: JObject, // Optional + data_storage_version: JObject, // Optional + storage_options_obj: JObject, // Map + storage_options_provider_obj: JObject, // Optional ) -> JObject<'a> { ok_or_throw_with_return!( env, @@ -176,7 +182,8 @@ pub extern "system" fn Java_com_lancedb_lance_Fragment_createWithFfiStream<'a>( mode, enable_stable_row_ids, data_storage_version, - storage_options_obj + storage_options_obj, + storage_options_provider_obj ), JObject::null() ) @@ -187,13 +194,14 @@ fn inner_create_with_ffi_stream<'local>( env: &mut JNIEnv<'local>, dataset_uri: JString, arrow_array_stream_addr: jlong, - max_rows_per_file: JObject, // Optional - max_rows_per_group: JObject, // Optional - max_bytes_per_file: JObject, // Optional - mode: JObject, // Optional - enable_stable_row_ids: JObject, // Optional - data_storage_version: JObject, // Optional - storage_options_obj: JObject, // Map + max_rows_per_file: JObject, // Optional + max_rows_per_group: JObject, // Optional + max_bytes_per_file: JObject, // Optional + mode: JObject, // Optional + enable_stable_row_ids: JObject, // Optional + data_storage_version: JObject, // Optional + storage_options_obj: JObject, // Map + storage_options_provider_obj: JObject, // Optional ) -> Result> { let stream_ptr = arrow_array_stream_addr as *mut FFI_ArrowArrayStream; let reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }?; @@ -208,6 +216,7 @@ fn inner_create_with_ffi_stream<'local>( enable_stable_row_ids, data_storage_version, storage_options_obj, + storage_options_provider_obj, reader, ) } @@ -216,13 +225,14 @@ fn inner_create_with_ffi_stream<'local>( fn create_fragment<'a>( env: &mut JNIEnv<'a>, dataset_uri: JString, - max_rows_per_file: JObject, // Optional - max_rows_per_group: JObject, // Optional - max_bytes_per_file: JObject, // Optional - mode: JObject, // Optional - 
enable_stable_row_ids: JObject, // Optional - data_storage_version: JObject, // Optional - storage_options_obj: JObject, // Map + max_rows_per_file: JObject, // Optional + max_rows_per_group: JObject, // Optional + max_bytes_per_file: JObject, // Optional + mode: JObject, // Optional + enable_stable_row_ids: JObject, // Optional + data_storage_version: JObject, // Optional + storage_options_obj: JObject, // Map + storage_options_provider_obj: JObject, // Optional source: impl StreamingWriteSource, ) -> Result> { let path_str = dataset_uri.extract(env)?; @@ -236,7 +246,9 @@ fn create_fragment<'a>( &enable_stable_row_ids, &data_storage_version, &storage_options_obj, + &storage_options_provider_obj, )?; + let fragments = RT.block_on(FileFragment::create_fragments( &path_str, source, diff --git a/java/lance-jni/src/merge_insert.rs b/java/lance-jni/src/merge_insert.rs index 3fe9a6742f2..f9b243bd999 100644 --- a/java/lance-jni/src/merge_insert.rs +++ b/java/lance-jni/src/merge_insert.rs @@ -49,8 +49,9 @@ fn inner_merge_insert<'local>( let retry_timeout_ms = extract_retry_timeout_ms(env, &jparam)?; let skip_auto_cleanup = extract_skip_auto_cleanup(env, &jparam)?; - let (new_ds, merge_stats) = unsafe { + let (new_ds, merge_stats, provider) = unsafe { let dataset = env.get_rust_field::<_, _, BlockingDataset>(jdataset, NATIVE_DATASET)?; + let provider = dataset.storage_options_provider.clone(); let when_not_matched_by_source = extract_when_not_matched_by_source( dataset.inner.schema(), @@ -70,12 +71,14 @@ fn inner_merge_insert<'local>( let stream_ptr = batch_address as *mut FFI_ArrowArrayStream; let source_stream = ArrowArrayStreamReader::from_raw(stream_ptr)?; - RT.block_on(async move { merge_insert_job.execute_reader(source_stream).await })? 
+ let result = RT.block_on(async move { merge_insert_job.execute_reader(source_stream).await })?; + (result.0, result.1, provider) }; MergeResult( BlockingDataset { inner: Arc::try_unwrap(new_ds).unwrap(), + storage_options_provider: provider, }, merge_stats, ) diff --git a/java/lance-jni/src/transaction.rs b/java/lance-jni/src/transaction.rs index 87ddc6fc7c4..8a2f2cc82a9 100644 --- a/java/lance-jni/src/transaction.rs +++ b/java/lance-jni/src/transaction.rs @@ -17,6 +17,7 @@ use lance::dataset::transaction::{ DataReplacementGroup, Operation, RewriteGroup, RewrittenIndex, Transaction, TransactionBuilder, UpdateMap, UpdateMapEntry, UpdateMode, }; +use lance::io::ObjectStoreParams; use lance::table::format::{Fragment, IndexMetadata}; use lance_core::datatypes::Schema as LanceSchema; use prost::Message; @@ -679,11 +680,26 @@ fn inner_commit_transaction<'local>( .l()?; let write_param_jmap = JMap::from_env(env, &write_param_jobj)?; let write_param = to_rust_map(env, &write_param_jmap)?; + + // Get the Dataset's storage_options_provider + let storage_options_provider = { + let dataset_guard = + unsafe { env.get_rust_field::<_, _, BlockingDataset>(&java_dataset, NATIVE_DATASET) }?; + dataset_guard.get_storage_options_provider() + }; + + // Build ObjectStoreParams using write_param for storage_options and provider from Dataset + let store_params = ObjectStoreParams { + storage_options: Some(write_param), + storage_options_provider, + ..Default::default() + }; + let transaction = convert_to_rust_transaction(env, java_transaction, Some(&java_dataset))?; let new_blocking_ds = { let mut dataset_guard = - unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; - dataset_guard.commit_transaction(transaction, write_param)? + unsafe { env.get_rust_field::<_, _, BlockingDataset>(&java_dataset, NATIVE_DATASET) }?; + dataset_guard.commit_transaction(transaction, store_params)? 
}; new_blocking_ds.into_java(env) } diff --git a/java/lance-jni/src/utils.rs b/java/lance-jni/src/utils.rs index 495ab229f4b..767cd4753af 100644 --- a/java/lance-jni/src/utils.rs +++ b/java/lance-jni/src/utils.rs @@ -21,8 +21,10 @@ use lance_linalg::distance::DistanceType; use crate::error::{Error, Result}; use crate::ffi::JNIEnvExt; +use crate::storage_options::JavaStorageOptionsProvider; use lance_index::vector::Query; +use lance_io::object_store::StorageOptionsProvider; use std::collections::HashMap; use std::str::FromStr; @@ -45,6 +47,7 @@ pub fn extract_write_params( enable_stable_row_ids: &JObject, data_storage_version: &JObject, storage_options_obj: &JObject, + storage_options_provider_obj: &JObject, // Optional ) -> Result { let mut write_params = WriteParams::default(); @@ -71,8 +74,36 @@ pub fn extract_write_params( let storage_options: HashMap = extract_storage_options(env, storage_options_obj)?; + // Extract storage options provider if present + let storage_options_provider = if !storage_options_provider_obj.is_null() { + // Check if it's an Optional.empty() + let is_present = env + .call_method(storage_options_provider_obj, "isPresent", "()Z", &[])? + .z()?; + if is_present { + // Get the value from Optional + let provider_obj = env + .call_method( + storage_options_provider_obj, + "get", + "()Ljava/lang/Object;", + &[], + )? + .l()?; + Some(JavaStorageOptionsProvider::new(env, provider_obj)?) 
+ } else { + None + } + } else { + None + }; + + let storage_options_provider_arc: Option> = + storage_options_provider.map(|v| Arc::new(v) as Arc); + write_params.store_params = Some(ObjectStoreParams { storage_options: Some(storage_options), + storage_options_provider: storage_options_provider_arc, ..Default::default() }); Ok(write_params) diff --git a/java/src/main/java/com/lancedb/lance/Dataset.java b/java/src/main/java/com/lancedb/lance/Dataset.java index 92badee32c0..f9574a832ad 100644 --- a/java/src/main/java/com/lancedb/lance/Dataset.java +++ b/java/src/main/java/com/lancedb/lance/Dataset.java @@ -75,6 +75,36 @@ public class Dataset implements Closeable { private Dataset() {} + /** + * Creates a builder for writing a dataset. + * + *

This builder supports writing datasets either directly to a URI or through a LanceNamespace. + * + *

Example usage with URI: + * + *

{@code
+   * Dataset dataset = Dataset.write(reader)
+   *     .uri("s3://bucket/table.lance")
+   *     .mode(WriteMode.CREATE)
+   *     .execute();
+   * }
+ * + *

Example usage with namespace: + * + *

{@code
+   * Dataset dataset = Dataset.write(reader)
+   *     .namespace(myNamespace, Arrays.asList("my_table"))
+   *     .mode(WriteMode.APPEND)
+   *     .execute();
+   * }
+ * + * @param reader ArrowReader containing the data to write + * @return A new WriteDatasetBuilder instance + */ + public static WriteDatasetBuilder write(ArrowReader reader) { + return new WriteDatasetBuilder(reader); + } + /** * Creates an empty dataset. * @@ -83,7 +113,10 @@ private Dataset() {} * @param schema dataset schema * @param params write params * @return Dataset + * @deprecated Use {@link #write(ArrowReader)} builder instead. For example: {@code + * Dataset.write(reader).uri(path).mode(WriteMode.CREATE).execute()} */ + @Deprecated public static Dataset create( BufferAllocator allocator, String path, Schema schema, WriteParams params) { Preconditions.checkNotNull(allocator); @@ -116,7 +149,10 @@ public static Dataset create( * @param path dataset uri * @param params write parameters * @return Dataset + * @deprecated Use {@link #write(ArrowReader)} builder instead. For example: {@code + * Dataset.write(reader).uri(path).mode(WriteMode.CREATE).execute()} */ + @Deprecated public static Dataset create( BufferAllocator allocator, ArrowArrayStream stream, String path, WriteParams params) { Preconditions.checkNotNull(allocator); diff --git a/java/src/main/java/com/lancedb/lance/Fragment.java b/java/src/main/java/com/lancedb/lance/Fragment.java index 7a0bfee67c9..3864734f06e 100644 --- a/java/src/main/java/com/lancedb/lance/Fragment.java +++ b/java/src/main/java/com/lancedb/lance/Fragment.java @@ -15,6 +15,7 @@ import com.lancedb.lance.fragment.FragmentMergeResult; import com.lancedb.lance.fragment.FragmentUpdateResult; +import com.lancedb.lance.io.StorageOptionsProvider; import com.lancedb.lance.ipc.LanceScanner; import com.lancedb.lance.ipc.ScanOptions; @@ -208,6 +209,26 @@ private native FragmentUpdateResult nativeUpdateColumns( */ public static List create( String datasetUri, BufferAllocator allocator, VectorSchemaRoot root, WriteParams params) { + return create(datasetUri, allocator, root, params, null); + } + + /** + * Create a fragment from the given 
data with optional storage options provider. + * + * @param datasetUri the dataset uri + * @param allocator the buffer allocator + * @param root the vector schema root + * @param params the write params + * @param storageOptionsProvider optional provider for dynamic storage options with automatic + * credential refresh + * @return the fragment metadata + */ + public static List create( + String datasetUri, + BufferAllocator allocator, + VectorSchemaRoot root, + WriteParams params, + StorageOptionsProvider storageOptionsProvider) { Preconditions.checkNotNull(datasetUri); Preconditions.checkNotNull(allocator); Preconditions.checkNotNull(root); @@ -225,7 +246,8 @@ public static List create( params.getMode(), params.getEnableStableRowIds(), params.getDataStorageVersion(), - params.getStorageOptions()); + params.getStorageOptions(), + Optional.ofNullable(storageOptionsProvider)); } } @@ -239,6 +261,24 @@ public static List create( */ public static List create( String datasetUri, ArrowArrayStream stream, WriteParams params) { + return create(datasetUri, stream, params, null); + } + + /** + * Create a fragment from the given arrow stream with optional storage options provider. 
+ * + * @param datasetUri the dataset uri + * @param stream the arrow stream + * @param params the write params + * @param storageOptionsProvider optional provider for dynamic storage options with automatic + * credential refresh + * @return the fragment metadata + */ + public static List create( + String datasetUri, + ArrowArrayStream stream, + WriteParams params, + StorageOptionsProvider storageOptionsProvider) { Preconditions.checkNotNull(datasetUri); Preconditions.checkNotNull(stream); Preconditions.checkNotNull(params); @@ -251,7 +291,8 @@ public static List create( params.getMode(), params.getEnableStableRowIds(), params.getDataStorageVersion(), - params.getStorageOptions()); + params.getStorageOptions(), + Optional.ofNullable(storageOptionsProvider)); } /** @@ -269,7 +310,8 @@ private static native List createWithFfiArray( Optional mode, Optional enableStableRowIds, Optional dataStorageVersion, - Map storageOptions); + Map storageOptions, + Optional storageOptionsProvider); /** * Create a fragment from the given arrow stream. @@ -285,5 +327,6 @@ private static native List createWithFfiStream( Optional mode, Optional enableStableRowIds, Optional dataStorageVersion, - Map storageOptions); + Map storageOptions, + Optional storageOptionsProvider); } diff --git a/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java b/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java new file mode 100644 index 00000000000..a2e54304a8e --- /dev/null +++ b/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java @@ -0,0 +1,348 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.lancedb.lance; + +import com.lancedb.lance.namespace.LanceNamespace; +import com.lancedb.lance.namespace.model.CreateEmptyTableRequest; +import com.lancedb.lance.namespace.model.CreateEmptyTableResponse; +import com.lancedb.lance.namespace.model.DescribeTableRequest; +import com.lancedb.lance.namespace.model.DescribeTableResponse; + +import org.apache.arrow.c.ArrowArrayStream; +import org.apache.arrow.c.Data; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.ipc.ArrowReader; +import org.apache.arrow.vector.types.pojo.Schema; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +/** + * Builder for writing datasets. + * + *

This builder provides a fluent API for creating or writing to datasets either directly to a + * URI or through a LanceNamespace. When using a namespace, the table location and storage options + * are automatically managed with credential vending support. + * + *

Example usage with URI: + * + *

{@code
+ * Dataset dataset = Dataset.write(reader)
+ *     .uri("s3://bucket/table.lance")
+ *     .mode(WriteMode.CREATE)
+ *     .execute();
+ * }
+ * + *

Example usage with namespace: + * + *

{@code
+ * Dataset dataset = Dataset.write(reader)
+ *     .namespace(myNamespace, Arrays.asList("my_table"))
+ *     .mode(WriteMode.CREATE)
+ *     .execute();
+ * }
+ */ +public class WriteDatasetBuilder { + private final ArrowReader reader; + private BufferAllocator allocator; + private String uri; + private LanceNamespace namespace; + private List tableId; + private WriteParams.WriteMode mode = WriteParams.WriteMode.CREATE; + private Schema schema; + private Map storageOptions = new HashMap<>(); + private boolean ignoreNamespaceStorageOptions = false; + private Optional maxRowsPerFile = Optional.empty(); + private Optional maxRowsPerGroup = Optional.empty(); + private Optional maxBytesPerFile = Optional.empty(); + private Optional enableStableRowIds = Optional.empty(); + private Optional dataStorageVersion = Optional.empty(); + + /** Creates a new builder instance. Package-private, use Dataset.write() instead. */ + WriteDatasetBuilder(ArrowReader reader) { + Preconditions.checkNotNull(reader, "reader must not be null"); + this.reader = reader; + } + + /** + * Sets the buffer allocator. + * + * @param allocator Arrow buffer allocator + * @return this builder instance + */ + public WriteDatasetBuilder allocator(BufferAllocator allocator) { + Preconditions.checkNotNull(allocator); + this.allocator = allocator; + return this; + } + + /** + * Sets the dataset URI. + * + *

Either uri() or namespace() must be specified, but not both. + * + * @param uri The dataset URI (e.g., "s3://bucket/table.lance" or "file:///path/to/table.lance") + * @return this builder instance + */ + public WriteDatasetBuilder uri(String uri) { + this.uri = uri; + return this; + } + + /** + * Sets the namespace and table identifier. + * + *

Either uri() or namespace() must be specified, but not both. + * + * @param namespace The namespace implementation to use for table operations + * @param tableId The table identifier (e.g., Arrays.asList("my_table")) + * @return this builder instance + */ + public WriteDatasetBuilder namespace(LanceNamespace namespace, List tableId) { + this.namespace = namespace; + this.tableId = tableId; + return this; + } + + /** + * Sets the write mode. + * + * @param mode The write mode (CREATE, APPEND, or OVERWRITE) + * @return this builder instance + */ + public WriteDatasetBuilder mode(WriteParams.WriteMode mode) { + Preconditions.checkNotNull(mode); + this.mode = mode; + return this; + } + + /** + * Sets the schema for the dataset. + * + *

If not provided, the schema will be inferred from the reader. + * + * @param schema The dataset schema + * @return this builder instance + */ + public WriteDatasetBuilder schema(Schema schema) { + this.schema = schema; + return this; + } + + /** + * Sets storage options for the dataset. + * + * @param storageOptions Storage configuration options + * @return this builder instance + */ + public WriteDatasetBuilder storageOptions(Map storageOptions) { + this.storageOptions = new HashMap<>(storageOptions); + return this; + } + + /** + * Sets whether to ignore storage options from the namespace's describe_table() or + * create_empty_table(). + * + * @param ignoreNamespaceStorageOptions If true, storage options returned from namespace will be + * ignored + * @return this builder instance + */ + public WriteDatasetBuilder ignoreNamespaceStorageOptions(boolean ignoreNamespaceStorageOptions) { + this.ignoreNamespaceStorageOptions = ignoreNamespaceStorageOptions; + return this; + } + + /** + * Sets the maximum number of rows per file. + * + * @param maxRowsPerFile Maximum rows per file + * @return this builder instance + */ + public WriteDatasetBuilder maxRowsPerFile(int maxRowsPerFile) { + this.maxRowsPerFile = Optional.of(maxRowsPerFile); + return this; + } + + /** + * Sets the maximum number of rows per group. + * + * @param maxRowsPerGroup Maximum rows per group + * @return this builder instance + */ + public WriteDatasetBuilder maxRowsPerGroup(int maxRowsPerGroup) { + this.maxRowsPerGroup = Optional.of(maxRowsPerGroup); + return this; + } + + /** + * Sets the maximum number of bytes per file. + * + * @param maxBytesPerFile Maximum bytes per file + * @return this builder instance + */ + public WriteDatasetBuilder maxBytesPerFile(long maxBytesPerFile) { + this.maxBytesPerFile = Optional.of(maxBytesPerFile); + return this; + } + + /** + * Sets whether to enable stable row IDs. 
+ * + * @param enableStableRowIds Whether to enable stable row IDs + * @return this builder instance + */ + public WriteDatasetBuilder enableStableRowIds(boolean enableStableRowIds) { + this.enableStableRowIds = Optional.of(enableStableRowIds); + return this; + } + + /** + * Sets the data storage version. + * + * @param dataStorageVersion The Lance file version to use + * @return this builder instance + */ + public WriteDatasetBuilder dataStorageVersion(WriteParams.LanceFileVersion dataStorageVersion) { + this.dataStorageVersion = Optional.of(dataStorageVersion); + return this; + } + + /** + * Executes the write operation and returns the created dataset. + * + *

If a namespace is configured, this automatically handles table creation or retrieval through + * the namespace API with credential vending support. + * + * @return Dataset + * @throws IllegalArgumentException if required parameters are missing or invalid + */ + public Dataset execute() { + // Validate that exactly one of uri or namespace is provided + boolean hasUri = uri != null; + boolean hasNamespace = namespace != null && tableId != null; + + if (hasUri && hasNamespace) { + throw new IllegalArgumentException( + "Cannot specify both uri and namespace. Use one or the other."); + } + if (!hasUri && !hasNamespace) { + if (namespace != null) { + throw new IllegalArgumentException( + "namespace is set but tableId is missing. Both namespace and tableId must be provided" + + " together."); + } else if (tableId != null) { + throw new IllegalArgumentException( + "tableId is set but namespace is missing. Both namespace and tableId must be provided" + + " together."); + } else { + throw new IllegalArgumentException("Either uri or namespace must be provided."); + } + } + + // Create allocator if not provided + if (allocator == null) { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + // Handle namespace-based writing + if (hasNamespace) { + return executeWithNamespace(); + } + + // Handle URI-based writing + return executeWithUri(); + } + + private Dataset executeWithNamespace() { + String tableUri; + Map namespaceStorageOptions = null; + + // Mode-specific namespace operations + if (mode == WriteParams.WriteMode.CREATE) { + // Call namespace.createEmptyTable() to create new table + CreateEmptyTableRequest request = new CreateEmptyTableRequest(); + request.setId(tableId); + + CreateEmptyTableResponse response = namespace.createEmptyTable(request); + + tableUri = response.getLocation(); + if (tableUri == null || tableUri.isEmpty()) { + throw new IllegalArgumentException("Namespace did not return a table location"); + } + + namespaceStorageOptions = 
ignoreNamespaceStorageOptions ? null : response.getStorageOptions(); + } else { + // For APPEND/OVERWRITE modes, call namespace.describeTable() + DescribeTableRequest request = new DescribeTableRequest(); + request.setId(tableId); + + DescribeTableResponse response = namespace.describeTable(request); + + tableUri = response.getLocation(); + if (tableUri == null || tableUri.isEmpty()) { + throw new IllegalArgumentException("Namespace did not return a table location"); + } + + namespaceStorageOptions = ignoreNamespaceStorageOptions ? null : response.getStorageOptions(); + } + + // Merge storage options (namespace options + user options, with namespace taking precedence) + Map mergedStorageOptions = new HashMap<>(storageOptions); + if (namespaceStorageOptions != null && !namespaceStorageOptions.isEmpty()) { + mergedStorageOptions.putAll(namespaceStorageOptions); + } + + // Build WriteParams with merged storage options + WriteParams.Builder paramsBuilder = + new WriteParams.Builder().withMode(mode).withStorageOptions(mergedStorageOptions); + + maxRowsPerFile.ifPresent(paramsBuilder::withMaxRowsPerFile); + maxRowsPerGroup.ifPresent(paramsBuilder::withMaxRowsPerGroup); + maxBytesPerFile.ifPresent(paramsBuilder::withMaxBytesPerFile); + enableStableRowIds.ifPresent(paramsBuilder::withEnableStableRowIds); + dataStorageVersion.ifPresent(paramsBuilder::withDataStorageVersion); + + WriteParams params = paramsBuilder.build(); + + // Use Dataset.create() which handles CREATE/APPEND/OVERWRITE modes + return createDatasetWithStream(tableUri, params); + } + + private Dataset executeWithUri() { + WriteParams.Builder paramsBuilder = + new WriteParams.Builder().withMode(mode).withStorageOptions(storageOptions); + + maxRowsPerFile.ifPresent(paramsBuilder::withMaxRowsPerFile); + maxRowsPerGroup.ifPresent(paramsBuilder::withMaxRowsPerGroup); + maxBytesPerFile.ifPresent(paramsBuilder::withMaxBytesPerFile); + enableStableRowIds.ifPresent(paramsBuilder::withEnableStableRowIds); + 
dataStorageVersion.ifPresent(paramsBuilder::withDataStorageVersion); + + WriteParams params = paramsBuilder.build(); + + return createDatasetWithStream(uri, params); + } + + private Dataset createDatasetWithStream(String path, WriteParams params) { + try (ArrowArrayStream stream = ArrowArrayStream.allocateNew(allocator)) { + Data.exportArrayStream(allocator, reader, stream); + return Dataset.create(allocator, stream, path, params); + } + } +} diff --git a/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java b/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java index 039fbbce662..514896cb86e 100644 --- a/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java +++ b/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java @@ -155,6 +155,29 @@ public String namespaceId() { return "MockLanceNamespace { }"; } + @Override + public com.lancedb.lance.namespace.model.CreateEmptyTableResponse createEmptyTable( + com.lancedb.lance.namespace.model.CreateEmptyTableRequest request) { + callCount.incrementAndGet(); + + String tableName = String.join("/", request.getId()); + // Generate a table URI for the new table + String tableUri = "s3://" + BUCKET_NAME + "/" + tableName + ".lance"; + tableLocations.put(tableName, tableUri); + + // Create storage options with expiration + Map storageOptions = new HashMap<>(baseStorageOptions); + long expiresAtMillis = System.currentTimeMillis() + (credentialExpiresInSeconds * 1000L); + storageOptions.put("expires_at_millis", String.valueOf(expiresAtMillis)); + + com.lancedb.lance.namespace.model.CreateEmptyTableResponse response = + new com.lancedb.lance.namespace.model.CreateEmptyTableResponse(); + response.setLocation(tableUri); + response.setStorageOptions(storageOptions); + + return response; + } + @Override public DescribeTableResponse describeTable(DescribeTableRequest request) { int count = callCount.incrementAndGet(); @@ -400,4 +423,213 @@ void testStorageOptionsProviderWithRefresh() 
throws Exception { } } } + + @Test + void testWriteDatasetBuilderWithNamespaceCreate() throws Exception { + try (BufferAllocator allocator = new RootAllocator()) { + // Set up storage options + Map storageOptions = new HashMap<>(); + storageOptions.put("allow_http", "true"); + storageOptions.put("aws_access_key_id", ACCESS_KEY); + storageOptions.put("aws_secret_access_key", SECRET_KEY); + storageOptions.put("aws_endpoint", ENDPOINT_URL); + storageOptions.put("aws_region", REGION); + + // Create mock namespace + MockLanceNamespace namespace = new MockLanceNamespace(storageOptions, 60); + String tableName = UUID.randomUUID().toString(); + + // Create schema and data + Schema schema = + new Schema( + Arrays.asList( + new Field("a", FieldType.nullable(new ArrowType.Int(32, true)), null), + new Field("b", FieldType.nullable(new ArrowType.Int(32, true)), null))); + + try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + IntVector aVector = (IntVector) root.getVector("a"); + IntVector bVector = (IntVector) root.getVector("b"); + + aVector.allocateNew(2); + bVector.allocateNew(2); + + aVector.set(0, 1); + bVector.set(0, 2); + aVector.set(1, 10); + bVector.set(1, 20); + + aVector.setValueCount(2); + bVector.setValueCount(2); + root.setRowCount(2); + + // Use the write builder with namespace + try (org.apache.arrow.vector.ipc.ArrowReader reader = + new org.apache.arrow.vector.ipc.ArrowStreamReader( + new java.io.ByteArrayInputStream(new byte[0]), allocator)) { + + // Create a test reader that returns our VectorSchemaRoot + org.apache.arrow.vector.ipc.ArrowReader testReader = + new org.apache.arrow.vector.ipc.ArrowReader(allocator) { + boolean firstRead = true; + + @Override + public boolean loadNextBatch() { + if (firstRead) { + firstRead = false; + return true; + } + return false; + } + + @Override + public long bytesRead() { + return 0; + } + + @Override + protected void closeReadSource() {} + + @Override + protected 
org.apache.arrow.vector.types.pojo.Schema readSchema() { + return schema; + } + + @Override + public VectorSchemaRoot getVectorSchemaRoot() { + return root; + } + }; + + int callCountBefore = namespace.getCallCount(); + + // Use the write builder to create a dataset through namespace + try (Dataset dataset = + Dataset.write(testReader) + .namespace(namespace, Arrays.asList(tableName)) + .mode(WriteParams.WriteMode.CREATE) + .allocator(allocator) + .execute()) { + + // Verify createEmptyTable was called + int callCountAfter = namespace.getCallCount(); + assertEquals( + 1, callCountAfter - callCountBefore, "createEmptyTable should be called once"); + + // Verify dataset was created successfully + assertEquals(2, dataset.countRows()); + assertEquals(schema, dataset.getSchema()); + } + } + } + } + } + + @Test + void testWriteDatasetBuilderWithNamespaceAppend() throws Exception { + try (BufferAllocator allocator = new RootAllocator()) { + // Set up storage options + Map storageOptions = new HashMap<>(); + storageOptions.put("allow_http", "true"); + storageOptions.put("aws_access_key_id", ACCESS_KEY); + storageOptions.put("aws_secret_access_key", SECRET_KEY); + storageOptions.put("aws_endpoint", ENDPOINT_URL); + storageOptions.put("aws_region", REGION); + + // Create initial dataset directly + String tableName = UUID.randomUUID().toString(); + String tableUri = "s3://" + BUCKET_NAME + "/" + tableName + ".lance"; + + Schema schema = + new Schema( + Arrays.asList( + new Field("a", FieldType.nullable(new ArrowType.Int(32, true)), null), + new Field("b", FieldType.nullable(new ArrowType.Int(32, true)), null))); + + try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + IntVector aVector = (IntVector) root.getVector("a"); + IntVector bVector = (IntVector) root.getVector("b"); + + aVector.allocateNew(2); + bVector.allocateNew(2); + + aVector.set(0, 1); + bVector.set(0, 2); + aVector.set(1, 10); + bVector.set(1, 20); + + aVector.setValueCount(2); + 
bVector.setValueCount(2); + root.setRowCount(2); + + WriteParams writeParams = + new WriteParams.Builder().withStorageOptions(storageOptions).build(); + + // Create initial dataset + try (Dataset dataset = Dataset.create(allocator, tableUri, schema, writeParams)) { + List fragments = + Fragment.create(tableUri, allocator, root, writeParams); + FragmentOperation.Append appendOp = new FragmentOperation.Append(fragments); + try (Dataset updatedDataset = + Dataset.commit(allocator, tableUri, appendOp, Optional.of(1L), storageOptions)) { + assertEquals(2, updatedDataset.countRows()); + } + } + + // Create mock namespace and register the table + MockLanceNamespace namespace = new MockLanceNamespace(storageOptions, 60); + namespace.registerTable(tableName, tableUri); + + // Now append data using the write builder with namespace + org.apache.arrow.vector.ipc.ArrowReader testReader = + new org.apache.arrow.vector.ipc.ArrowReader(allocator) { + boolean firstRead = true; + + @Override + public boolean loadNextBatch() { + if (firstRead) { + firstRead = false; + return true; + } + return false; + } + + @Override + public long bytesRead() { + return 0; + } + + @Override + protected void closeReadSource() {} + + @Override + protected org.apache.arrow.vector.types.pojo.Schema readSchema() { + return schema; + } + + @Override + public VectorSchemaRoot getVectorSchemaRoot() { + return root; + } + }; + + int callCountBefore = namespace.getCallCount(); + + // Use the write builder to append to dataset through namespace + try (Dataset dataset = + Dataset.write(testReader) + .namespace(namespace, Arrays.asList(tableName)) + .mode(WriteParams.WriteMode.APPEND) + .allocator(allocator) + .execute()) { + + // Verify describeTable was called + int callCountAfter = namespace.getCallCount(); + assertEquals(1, callCountAfter - callCountBefore, "describeTable should be called once"); + + // Verify data was appended successfully + assertEquals(4, dataset.countRows()); // Original 2 + appended 2 + 
} + } + } + } } diff --git a/java/src/test/java/com/lancedb/lance/StorageOptionsProviderWriteTest.java b/java/src/test/java/com/lancedb/lance/StorageOptionsProviderWriteTest.java new file mode 100644 index 00000000000..6d8408f3286 --- /dev/null +++ b/java/src/test/java/com/lancedb/lance/StorageOptionsProviderWriteTest.java @@ -0,0 +1,572 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.lancedb.lance; + +import com.lancedb.lance.io.StorageOptionsProvider; +import com.lancedb.lance.operation.Append; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable; +import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; +import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; +import software.amazon.awssdk.regions.Region; +import software.amazon.awssdk.services.s3.S3Client; +import software.amazon.awssdk.services.s3.model.CreateBucketRequest; +import 
software.amazon.awssdk.services.s3.model.DeleteBucketRequest; +import software.amazon.awssdk.services.s3.model.DeleteObjectRequest; +import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; +import software.amazon.awssdk.services.s3.model.S3Object; + +import java.net.URI; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Integration tests for Fragment write and Transaction commit with StorageOptionsProvider. + * + *
<p>
These tests verify that: - Fragment.create() can use StorageOptionsProvider for credential + * refresh - Transaction.commit() can use StorageOptionsProvider when creating new datasets - + * Transaction.commit() can use StorageOptionsProvider when committing to existing datasets + * + *
<p>
These tests require LocalStack to be running. Run with: docker compose up -d + * + *
<p>
Set LANCE_INTEGRATION_TEST=1 environment variable to enable these tests. + */ +@EnabledIfEnvironmentVariable(named = "LANCE_INTEGRATION_TEST", matches = "1") +public class StorageOptionsProviderWriteTest { + + private static final String ENDPOINT_URL = "http://localhost:4566"; + private static final String REGION = "us-east-1"; + private static final String ACCESS_KEY = "ACCESS_KEY"; + private static final String SECRET_KEY = "SECRET_KEY"; + private static final String BUCKET_NAME = "lance-write-provider-test-java"; + + private static S3Client s3Client; + + @BeforeAll + static void setup() { + s3Client = + S3Client.builder() + .endpointOverride(URI.create(ENDPOINT_URL)) + .region(Region.of(REGION)) + .credentialsProvider( + StaticCredentialsProvider.create( + AwsBasicCredentials.create(ACCESS_KEY, SECRET_KEY))) + .forcePathStyle(true) + .build(); + + // Delete bucket if it exists from previous run + try { + deleteBucket(); + } catch (Exception e) { + // Ignore if bucket doesn't exist + } + + // Create test bucket + s3Client.createBucket(CreateBucketRequest.builder().bucket(BUCKET_NAME).build()); + } + + @AfterAll + static void tearDown() { + if (s3Client != null) { + try { + deleteBucket(); + } catch (Exception e) { + // Ignore cleanup errors + } + s3Client.close(); + } + } + + private static void deleteBucket() { + // Delete all objects first + List objects = + s3Client + .listObjectsV2(ListObjectsV2Request.builder().bucket(BUCKET_NAME).build()) + .contents(); + for (S3Object obj : objects) { + s3Client.deleteObject( + DeleteObjectRequest.builder().bucket(BUCKET_NAME).key(obj.key()).build()); + } + s3Client.deleteBucket(DeleteBucketRequest.builder().bucket(BUCKET_NAME).build()); + } + + /** + * Mock StorageOptionsProvider that tracks how many times it's been called and provides + * credentials with expiration times. 
+ */ + static class MockStorageOptionsProvider implements StorageOptionsProvider { + private final Map baseStorageOptions; + private final int credentialExpiresInSeconds; + private final AtomicInteger callCount = new AtomicInteger(0); + + public MockStorageOptionsProvider( + Map storageOptions, int credentialExpiresInSeconds) { + this.baseStorageOptions = new HashMap<>(storageOptions); + this.credentialExpiresInSeconds = credentialExpiresInSeconds; + } + + @Override + public Map fetchStorageOptions() { + int count = callCount.incrementAndGet(); + + Map storageOptions = new HashMap<>(baseStorageOptions); + long expiresAtMillis = System.currentTimeMillis() + (credentialExpiresInSeconds * 1000L); + storageOptions.put("expires_at_millis", String.valueOf(expiresAtMillis)); + + // Add a marker to track which credential set this is + storageOptions.put("credential_version", String.valueOf(count)); + + return storageOptions; + } + + public int getCallCount() { + return callCount.get(); + } + } + + @Test + void testFragmentCreateWithStorageOptionsProvider() throws Exception { + try (BufferAllocator allocator = new RootAllocator()) { + String tableName = UUID.randomUUID().toString(); + String tableUri = "s3://" + BUCKET_NAME + "/" + tableName + ".lance"; + + // Create base storage options + Map storageOptions = new HashMap<>(); + storageOptions.put("allow_http", "true"); + storageOptions.put("aws_access_key_id", ACCESS_KEY); + storageOptions.put("aws_secret_access_key", SECRET_KEY); + storageOptions.put("aws_endpoint", ENDPOINT_URL); + storageOptions.put("aws_region", REGION); + + // Create mock provider with 60-second expiration + MockStorageOptionsProvider provider = new MockStorageOptionsProvider(storageOptions, 60); + + // Create schema + Schema schema = + new Schema( + Arrays.asList( + new Field("id", FieldType.nullable(new ArrowType.Int(32, true)), null), + new Field("value", FieldType.nullable(new ArrowType.Int(32, true)), null))); + + // First, create an empty 
dataset + WriteParams createParams = + new WriteParams.Builder().withStorageOptions(storageOptions).build(); + try (Dataset dataset = Dataset.create(allocator, tableUri, schema, createParams)) { + assertEquals(1, dataset.version()); + + // Now write fragments using StorageOptionsProvider + try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + IntVector idVector = (IntVector) root.getVector("id"); + IntVector valueVector = (IntVector) root.getVector("value"); + + // Write first fragment + idVector.allocateNew(3); + valueVector.allocateNew(3); + + idVector.set(0, 1); + valueVector.set(0, 100); + idVector.set(1, 2); + valueVector.set(1, 200); + idVector.set(2, 3); + valueVector.set(2, 300); + + idVector.setValueCount(3); + valueVector.setValueCount(3); + root.setRowCount(3); + + WriteParams writeParams = new WriteParams.Builder().build(); + + int callCountBefore = provider.getCallCount(); + + // Create fragment with StorageOptionsProvider + List fragments1 = + Fragment.create(tableUri, allocator, root, writeParams, provider); + + assertNotNull(fragments1); + assertEquals(1, fragments1.size()); + + // Verify provider was called + int callCountAfter1 = provider.getCallCount(); + assertTrue( + callCountAfter1 > callCountBefore, + "Provider should be called during fragment creation"); + + // Write second fragment with different data + idVector.set(0, 4); + valueVector.set(0, 400); + idVector.set(1, 5); + valueVector.set(1, 500); + idVector.set(2, 6); + valueVector.set(2, 600); + root.setRowCount(3); + + // Create another fragment with the same provider + List fragments2 = + Fragment.create(tableUri, allocator, root, writeParams, provider); + + assertNotNull(fragments2); + assertEquals(1, fragments2.size()); + + int callCountAfter2 = provider.getCallCount(); + assertTrue( + callCountAfter2 > callCountAfter1, + "Provider should be called again for second fragment"); + + // Commit both fragments to the dataset + FragmentOperation.Append appendOp = new 
FragmentOperation.Append(fragments1); + try (Dataset updatedDataset = + Dataset.commit(allocator, tableUri, appendOp, Optional.of(1L), storageOptions)) { + assertEquals(2, updatedDataset.version()); + assertEquals(3, updatedDataset.countRows()); + + // Append second fragment + FragmentOperation.Append appendOp2 = new FragmentOperation.Append(fragments2); + try (Dataset finalDataset = + Dataset.commit(allocator, tableUri, appendOp2, Optional.of(2L), storageOptions)) { + assertEquals(3, finalDataset.version()); + assertEquals(6, finalDataset.countRows()); + } + } + } + } + } + } + + @Test + void testTransactionCommitWithStorageOptionsProviderNewDataset() throws Exception { + try (BufferAllocator allocator = new RootAllocator()) { + String tableName = UUID.randomUUID().toString(); + String tableUri = "s3://" + BUCKET_NAME + "/" + tableName + ".lance"; + + // Create base storage options + Map storageOptions = new HashMap<>(); + storageOptions.put("allow_http", "true"); + storageOptions.put("aws_access_key_id", ACCESS_KEY); + storageOptions.put("aws_secret_access_key", SECRET_KEY); + storageOptions.put("aws_endpoint", ENDPOINT_URL); + storageOptions.put("aws_region", REGION); + + // Create mock provider + MockStorageOptionsProvider provider = new MockStorageOptionsProvider(storageOptions, 60); + + // Create schema + Schema schema = + new Schema( + Arrays.asList( + new Field("id", FieldType.nullable(new ArrowType.Int(32, true)), null), + new Field("name", FieldType.nullable(new ArrowType.Utf8()), null))); + + // Create empty dataset first with provider + WriteParams createParams = + new WriteParams.Builder().withStorageOptions(storageOptions).build(); + try (Dataset dataset = Dataset.create(allocator, tableUri, schema, createParams)) { + assertEquals(1, dataset.version()); + + // Re-open dataset with provider so transactions can use it + ReadOptions readOptions = + new ReadOptions.Builder() + .setStorageOptions(storageOptions) + .setStorageOptionsProvider(provider) + 
.build(); + try (Dataset datasetWithProvider = Dataset.open(allocator, tableUri, readOptions)) { + + // Create fragments to commit + try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + IntVector idVector = (IntVector) root.getVector("id"); + org.apache.arrow.vector.VarCharVector nameVector = + (org.apache.arrow.vector.VarCharVector) root.getVector("name"); + + idVector.allocateNew(2); + nameVector.allocateNew(2); + + idVector.set(0, 1); + nameVector.setSafe(0, "Alice".getBytes()); + idVector.set(1, 2); + nameVector.setSafe(1, "Bob".getBytes()); + + idVector.setValueCount(2); + nameVector.setValueCount(2); + root.setRowCount(2); + + WriteParams writeParams = new WriteParams.Builder().build(); + + // Create fragments with provider + List fragments = + Fragment.create(tableUri, allocator, root, writeParams, provider); + + // Create transaction - provider will be inherited from dataset + Append appendOp = Append.builder().fragments(fragments).build(); + + int callCountBefore = provider.getCallCount(); + + Transaction transaction = + new Transaction.Builder(datasetWithProvider) + .readVersion(1L) + .operation(appendOp) + .build(); + + // Commit transaction + try (Dataset committedDataset = transaction.commit()) { + assertNotNull(committedDataset); + assertEquals(2, committedDataset.version()); + assertEquals(2, committedDataset.countRows()); + + // Verify provider was called during commit + int callCountAfter = provider.getCallCount(); + assertTrue( + callCountAfter > callCountBefore, + "Provider should be called during transaction commit"); + } + } + } + } + } + } + + @Test + void testTransactionCommitWithStorageOptionsProviderExistingDataset() throws Exception { + try (BufferAllocator allocator = new RootAllocator()) { + String tableName = UUID.randomUUID().toString(); + String tableUri = "s3://" + BUCKET_NAME + "/" + tableName + ".lance"; + + // Create base storage options + Map storageOptions = new HashMap<>(); + 
storageOptions.put("allow_http", "true"); + storageOptions.put("aws_access_key_id", ACCESS_KEY); + storageOptions.put("aws_secret_access_key", SECRET_KEY); + storageOptions.put("aws_endpoint", ENDPOINT_URL); + storageOptions.put("aws_region", REGION); + + // Create schema + Schema schema = + new Schema( + Arrays.asList( + new Field("x", FieldType.nullable(new ArrowType.Int(32, true)), null), + new Field("y", FieldType.nullable(new ArrowType.Int(32, true)), null))); + + // Create initial dataset with some data + try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + IntVector xVector = (IntVector) root.getVector("x"); + IntVector yVector = (IntVector) root.getVector("y"); + + xVector.allocateNew(2); + yVector.allocateNew(2); + + xVector.set(0, 1); + yVector.set(0, 10); + xVector.set(1, 2); + yVector.set(1, 20); + + xVector.setValueCount(2); + yVector.setValueCount(2); + root.setRowCount(2); + + WriteParams createParams = + new WriteParams.Builder().withStorageOptions(storageOptions).build(); + + try (Dataset dataset = Dataset.create(allocator, tableUri, schema, createParams)) { + List initialFragments = + Fragment.create(tableUri, allocator, root, createParams); + FragmentOperation.Append initialAppend = new FragmentOperation.Append(initialFragments); + + try (Dataset datasetV2 = + Dataset.commit(allocator, tableUri, initialAppend, Optional.of(1L), storageOptions)) { + assertEquals(2, datasetV2.version()); + assertEquals(2, datasetV2.countRows()); + + // Now test committing additional data with StorageOptionsProvider + MockStorageOptionsProvider provider = + new MockStorageOptionsProvider(storageOptions, 60); + + // Create more data to append + xVector.set(0, 3); + yVector.set(0, 30); + xVector.set(1, 4); + yVector.set(1, 40); + root.setRowCount(2); + + WriteParams writeParams = new WriteParams.Builder().build(); + + // Create fragments with provider + List newFragments = + Fragment.create(tableUri, allocator, root, writeParams, provider); + + 
int callCountBefore = provider.getCallCount(); + + // Open the existing dataset with provider + ReadOptions readOptions = + new ReadOptions.Builder() + .setStorageOptions(storageOptions) + .setStorageOptionsProvider(provider) + .build(); + try (Dataset existingDataset = Dataset.open(allocator, tableUri, readOptions)) { + // Create transaction - provider will be inherited from dataset + Append appendOp = Append.builder().fragments(newFragments).build(); + + Transaction transaction = + new Transaction.Builder(existingDataset) + .readVersion(existingDataset.version()) + .operation(appendOp) + .build(); + + // Commit transaction to existing dataset + try (Dataset committedDataset = transaction.commit()) { + assertNotNull(committedDataset); + assertEquals(3, committedDataset.version()); + assertEquals(4, committedDataset.countRows()); + + // Verify provider was called during commit + int callCountAfter = provider.getCallCount(); + assertTrue( + callCountAfter > callCountBefore, + "Provider should be called during transaction commit to existing dataset"); + } + } + } + } + } + } + } + + @Test + void testMultipleFragmentsAndCommitWithProvider() throws Exception { + try (BufferAllocator allocator = new RootAllocator()) { + String tableName = UUID.randomUUID().toString(); + String tableUri = "s3://" + BUCKET_NAME + "/" + tableName + ".lance"; + + // Create base storage options + Map storageOptions = new HashMap<>(); + storageOptions.put("allow_http", "true"); + storageOptions.put("aws_access_key_id", ACCESS_KEY); + storageOptions.put("aws_secret_access_key", SECRET_KEY); + storageOptions.put("aws_endpoint", ENDPOINT_URL); + storageOptions.put("aws_region", REGION); + + // Create mock provider with short expiration to test refresh + MockStorageOptionsProvider provider = new MockStorageOptionsProvider(storageOptions, 5); + + // Create schema + Schema schema = + new Schema( + Arrays.asList( + new Field("data", FieldType.nullable(new ArrowType.Int(32, true)), null))); + + // 
Create empty dataset + WriteParams createParams = + new WriteParams.Builder().withStorageOptions(storageOptions).build(); + try (Dataset dataset = Dataset.create(allocator, tableUri, schema, createParams)) { + assertEquals(1, dataset.version()); + + try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + IntVector dataVector = (IntVector) root.getVector("data"); + WriteParams writeParams = new WriteParams.Builder().build(); + + // Create multiple fragments with the provider + List> allFragments = new java.util.ArrayList<>(); + + for (int i = 0; i < 3; i++) { + dataVector.allocateNew(2); + dataVector.set(0, i * 10); + dataVector.set(1, i * 10 + 1); + dataVector.setValueCount(2); + root.setRowCount(2); + + List fragments = + Fragment.create(tableUri, allocator, root, writeParams, provider); + allFragments.add(fragments); + + // Add a small delay to simulate real-world usage + Thread.sleep(100); + } + + // Verify provider was called multiple times + int callCountAfterFragments = provider.getCallCount(); + assertTrue(callCountAfterFragments >= 3, "Provider should be called for each fragment"); + + // Commit all fragments in one transaction with provider + int callCountBeforeCommit = provider.getCallCount(); + + FragmentOperation.Append appendOp1 = new FragmentOperation.Append(allFragments.get(0)); + try (Dataset v2 = + Dataset.commit(allocator, tableUri, appendOp1, Optional.of(1L), storageOptions)) { + + // Open v2 with provider so transaction can inherit it + ReadOptions readOptions = + new ReadOptions.Builder() + .setStorageOptions(storageOptions) + .setStorageOptionsProvider(provider) + .build(); + try (Dataset v2WithProvider = Dataset.open(allocator, tableUri, readOptions)) { + + Append appendOp2 = Append.builder().fragments(allFragments.get(1)).build(); + Transaction transaction = + new Transaction.Builder(v2WithProvider) + .readVersion(v2WithProvider.version()) + .operation(appendOp2) + .build(); + + try (Dataset v3 = transaction.commit()) { 
+ assertEquals(3, v3.version()); + + // Re-open v3 with provider for next transaction + try (Dataset v3WithProvider = Dataset.open(allocator, tableUri, readOptions)) { + + // Commit third fragment + Append appendOp3 = Append.builder().fragments(allFragments.get(2)).build(); + Transaction transaction2 = + new Transaction.Builder(v3WithProvider) + .readVersion(v3WithProvider.version()) + .operation(appendOp3) + .build(); + + try (Dataset finalDataset = transaction2.commit()) { + assertNotNull(finalDataset); + assertEquals(4, finalDataset.version()); + assertEquals(6, finalDataset.countRows()); // 3 fragments * 2 rows each + + // Verify provider was used during commits + int finalCallCount = provider.getCallCount(); + assertTrue( + finalCallCount > callCountBeforeCommit, + "Provider should be called during transaction commits"); + } + } + } + } + } + } + } + } + } +} From 3bcd5e0c516216a59d6664f3a4a1670b3e333b02 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 20 Nov 2025 20:08:05 -0800 Subject: [PATCH 02/12] cleanup api --- java/lance-jni/src/blocking_dataset.rs | 64 ++++++-------------------- java/lance-jni/src/merge_insert.rs | 7 +-- rust/lance/src/dataset.rs | 9 ++++ 3 files changed, 24 insertions(+), 56 deletions(-) diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index b44ef7b41a3..3b2e7f41b16 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -56,13 +56,12 @@ pub const NATIVE_DATASET: &str = "nativeDatasetHandle"; #[derive(Clone)] pub struct BlockingDataset { pub(crate) inner: Dataset, - pub(crate) storage_options_provider: Option>, } impl BlockingDataset { /// Get the storage options provider that was used when opening this dataset pub fn get_storage_options_provider(&self) -> Option> { - self.storage_options_provider.clone() + self.inner.storage_options_provider() } pub fn drop(uri: &str, storage_options: HashMap) -> Result<()> { @@ -87,15 +86,8 @@ impl 
BlockingDataset { uri: &str, params: Option, ) -> Result { - let storage_options_provider = params - .as_ref() - .and_then(|p| p.store_params.as_ref()) - .and_then(|sp| sp.storage_options_provider.clone()); let inner = RT.block_on(Dataset::write(reader, uri, params))?; - Ok(Self { - inner, - storage_options_provider, - }) + Ok(Self { inner }) } #[allow(clippy::too_many_arguments)] @@ -148,10 +140,7 @@ impl BlockingDataset { } let inner = RT.block_on(builder.load())?; - Ok(Self { - inner, - storage_options_provider, - }) + Ok(Self { inner }) } pub fn commit( @@ -172,10 +161,7 @@ impl BlockingDataset { Default::default(), false, // TODO: support enable_v2_manifest_paths ))?; - Ok(Self { - inner, - storage_options_provider: None, - }) + Ok(Self { inner }) } pub fn latest_version(&self) -> Result { @@ -194,18 +180,12 @@ impl BlockingDataset { pub fn checkout_version(&mut self, version: u64) -> Result { let inner = RT.block_on(self.inner.checkout_version(version))?; - Ok(Self { - inner, - storage_options_provider: self.storage_options_provider.clone(), - }) + Ok(Self { inner }) } pub fn checkout_tag(&mut self, tag: &str) -> Result { let inner = RT.block_on(self.inner.checkout_version(tag))?; - Ok(Self { - inner, - storage_options_provider: self.storage_options_provider.clone(), - }) + Ok(Self { inner }) } pub fn checkout_latest(&mut self) -> Result<()> { @@ -239,10 +219,7 @@ impl BlockingDataset { None => Ref::from(version), }; let inner = RT.block_on(self.inner.create_branch(branch, reference, None))?; - Ok(Self { - inner, - storage_options_provider: self.storage_options_provider.clone(), - }) + Ok(Self { inner }) } pub fn delete_branch(&mut self, branch: &str) -> Result<()> { @@ -262,10 +239,7 @@ impl BlockingDataset { Ref::Version(branch, version) }; let inner = RT.block_on(self.inner.checkout_version(reference))?; - Ok(Self { - inner, - storage_options_provider: self.storage_options_provider.clone(), - }) + Ok(Self { inner }) } pub fn create_tag( @@ -317,7 +291,6 @@ 
impl BlockingDataset { transaction: Transaction, store_params: ObjectStoreParams, ) -> Result { - let storage_options_provider = store_params.storage_options_provider.clone(); let new_dataset = RT.block_on( CommitBuilder::new(Arc::new(self.clone().inner)) .with_store_params(store_params) @@ -325,7 +298,6 @@ impl BlockingDataset { )?; Ok(BlockingDataset { inner: new_dataset, - storage_options_provider, }) } @@ -1340,11 +1312,10 @@ fn inner_shallow_clone<'local>( } }; - let (new_ds, provider) = { + let new_ds = { let mut dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; - let provider = dataset_guard.storage_options_provider.clone(); - let new_ds = RT.block_on( + RT.block_on( dataset_guard.inner.shallow_clone( &target_path_str, reference, @@ -1357,15 +1328,10 @@ fn inner_shallow_clone<'local>( }) .unwrap_or(None), ), - )?; - (new_ds, provider) + )? }; - BlockingDataset { - inner: new_ds, - storage_options_provider: provider, - } - .into_java(env) + BlockingDataset { inner: new_ds }.into_java(env) } #[no_mangle] @@ -2153,16 +2119,12 @@ fn inner_create_branch_on_tag<'local>( let new_blocking_dataset = { let mut dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; - let provider = dataset_guard.storage_options_provider.clone(); let inner = RT.block_on(dataset_guard.inner.create_branch( branch_name.as_str(), reference, None, ))?; - BlockingDataset { - inner, - storage_options_provider: provider, - } + BlockingDataset { inner } }; new_blocking_dataset.into_java(env) } diff --git a/java/lance-jni/src/merge_insert.rs b/java/lance-jni/src/merge_insert.rs index f9b243bd999..3fe9a6742f2 100644 --- a/java/lance-jni/src/merge_insert.rs +++ b/java/lance-jni/src/merge_insert.rs @@ -49,9 +49,8 @@ fn inner_merge_insert<'local>( let retry_timeout_ms = extract_retry_timeout_ms(env, &jparam)?; let skip_auto_cleanup = extract_skip_auto_cleanup(env, &jparam)?; - let (new_ds, 
merge_stats, provider) = unsafe { + let (new_ds, merge_stats) = unsafe { let dataset = env.get_rust_field::<_, _, BlockingDataset>(jdataset, NATIVE_DATASET)?; - let provider = dataset.storage_options_provider.clone(); let when_not_matched_by_source = extract_when_not_matched_by_source( dataset.inner.schema(), @@ -71,14 +70,12 @@ fn inner_merge_insert<'local>( let stream_ptr = batch_address as *mut FFI_ArrowArrayStream; let source_stream = ArrowArrayStreamReader::from_raw(stream_ptr)?; - let result = RT.block_on(async move { merge_insert_job.execute_reader(source_stream).await })?; - (result.0, result.1, provider) + RT.block_on(async move { merge_insert_job.execute_reader(source_stream).await })? }; MergeResult( BlockingDataset { inner: Arc::try_unwrap(new_ds).unwrap(), - storage_options_provider: provider, }, merge_stats, ) diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 8f2895537c5..992bbd48be6 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -1581,6 +1581,15 @@ impl Dataset { .and_then(|params| params.storage_options.as_ref()) } + /// Returns the storage options provider used when opening this dataset, if any. 
+ pub fn storage_options_provider( + &self, + ) -> Option> { + self.store_params + .as_ref() + .and_then(|params| params.storage_options_provider.clone()) + } + pub fn data_dir(&self) -> Path { self.base.child(DATA_DIR) } From d21943dcabaec9cf484c1c0e57e8f7efced82d83 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 20 Nov 2025 20:30:47 -0800 Subject: [PATCH 03/12] consistent with open --- .../lancedb/lance/WriteDatasetBuilder.java | 40 ++++++++++++------- .../lance/NamespaceIntegrationTest.java | 6 ++- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java b/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java index a2e54304a8e..76c67b6d1eb 100644 --- a/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java +++ b/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java @@ -52,7 +52,8 @@ * *

{@code
  * Dataset dataset = Dataset.write(reader)
- *     .namespace(myNamespace, Arrays.asList("my_table"))
+ *     .namespace(myNamespace)
+ *     .tableId(Arrays.asList("my_table"))
  *     .mode(WriteMode.CREATE)
  *     .execute();
  * }
@@ -94,7 +95,7 @@ public WriteDatasetBuilder allocator(BufferAllocator allocator) { /** * Sets the dataset URI. * - *

Either uri() or namespace() must be specified, but not both. + *

Either uri() or namespace()+tableId() must be specified, but not both. * * @param uri The dataset URI (e.g., "s3://bucket/table.lance" or "file:///path/to/table.lance") * @return this builder instance @@ -105,16 +106,29 @@ public WriteDatasetBuilder uri(String uri) { } /** - * Sets the namespace and table identifier. + * Sets the namespace. * - *

Either uri() or namespace() must be specified, but not both. + *

Must be used together with tableId(). Either uri() or namespace()+tableId() must be + * specified, but not both. * * @param namespace The namespace implementation to use for table operations - * @param tableId The table identifier (e.g., Arrays.asList("my_table")) * @return this builder instance */ - public WriteDatasetBuilder namespace(LanceNamespace namespace, List tableId) { + public WriteDatasetBuilder namespace(LanceNamespace namespace) { this.namespace = namespace; + return this; + } + + /** + * Sets the table identifier. + * + *

Must be used together with namespace(). Either uri() or namespace()+tableId() must be + * specified, but not both. + * + * @param tableId The table identifier (e.g., Arrays.asList("my_table")) + * @return this builder instance + */ + public WriteDatasetBuilder tableId(List tableId) { this.tableId = tableId; return this; } @@ -226,8 +240,8 @@ public WriteDatasetBuilder dataStorageVersion(WriteParams.LanceFileVersion dataS /** * Executes the write operation and returns the created dataset. * - *

If a namespace is configured, this automatically handles table creation or retrieval through - * the namespace API with credential vending support. + *

If a namespace is configured via namespace()+tableId(), this automatically handles table + * creation or retrieval through the namespace API with credential vending support. * * @return Dataset * @throws IllegalArgumentException if required parameters are missing or invalid @@ -239,19 +253,17 @@ public Dataset execute() { if (hasUri && hasNamespace) { throw new IllegalArgumentException( - "Cannot specify both uri and namespace. Use one or the other."); + "Cannot specify both uri() and namespace()+tableId(). Use one or the other."); } if (!hasUri && !hasNamespace) { if (namespace != null) { throw new IllegalArgumentException( - "namespace is set but tableId is missing. Both namespace and tableId must be provided" - + " together."); + "namespace() is set but tableId() is missing. Both must be provided together."); } else if (tableId != null) { throw new IllegalArgumentException( - "tableId is set but namespace is missing. Both namespace and tableId must be provided" - + " together."); + "tableId() is set but namespace() is missing. 
Both must be provided together."); } else { - throw new IllegalArgumentException("Either uri or namespace must be provided."); + throw new IllegalArgumentException("Either uri() or namespace()+tableId() must be called."); } } diff --git a/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java b/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java index 514896cb86e..bb64ef795a1 100644 --- a/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java +++ b/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java @@ -505,7 +505,8 @@ public VectorSchemaRoot getVectorSchemaRoot() { // Use the write builder to create a dataset through namespace try (Dataset dataset = Dataset.write(testReader) - .namespace(namespace, Arrays.asList(tableName)) + .namespace(namespace) + .tableId(Arrays.asList(tableName)) .mode(WriteParams.WriteMode.CREATE) .allocator(allocator) .execute()) { @@ -617,7 +618,8 @@ public VectorSchemaRoot getVectorSchemaRoot() { // Use the write builder to append to dataset through namespace try (Dataset dataset = Dataset.write(testReader) - .namespace(namespace, Arrays.asList(tableName)) + .namespace(namespace) + .tableId(Arrays.asList(tableName)) .mode(WriteParams.WriteMode.APPEND) .allocator(allocator) .execute()) { From 31f1fb3187fbbfc614ff51876f444c267ca8838c Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 20 Nov 2025 22:14:29 -0800 Subject: [PATCH 04/12] update --- java/lance-jni/src/blocking_dataset.rs | 18 ++- java/lance-jni/src/fragment.rs | 14 ++- java/lance-jni/src/transaction.rs | 10 +- java/lance-jni/src/utils.rs | 8 ++ .../main/java/com/lancedb/lance/Dataset.java | 54 ++++++--- .../main/java/com/lancedb/lance/Fragment.java | 12 +- .../java/com/lancedb/lance/Transaction.java | 26 +++- .../lancedb/lance/WriteDatasetBuilder.java | 113 +++++++++++++++--- .../java/com/lancedb/lance/WriteParams.java | 18 ++- .../lance/NamespaceIntegrationTest.java | 10 +- 10 files changed, 232 insertions(+), 51 deletions(-) 
diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index 3b2e7f41b16..50b64596e83 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -296,9 +296,7 @@ impl BlockingDataset { .with_store_params(store_params) .execute(transaction), )?; - Ok(BlockingDataset { - inner: new_dataset, - }) + Ok(BlockingDataset { inner: new_dataset }) } pub fn read_transaction(&self) -> Result> { @@ -338,6 +336,7 @@ pub extern "system" fn Java_com_lancedb_lance_Dataset_createWithFfiSchema<'local enable_stable_row_ids: JObject, // Optional data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map + s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw!( env, @@ -351,7 +350,8 @@ pub extern "system" fn Java_com_lancedb_lance_Dataset_createWithFfiSchema<'local mode, enable_stable_row_ids, data_storage_version, - storage_options_obj + storage_options_obj, + s3_credentials_refresh_offset_seconds_obj ) ) } @@ -368,6 +368,7 @@ fn inner_create_with_ffi_schema<'local>( enable_stable_row_ids: JObject, // Optional data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map + s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let c_schema_ptr = arrow_schema_addr as *mut FFI_ArrowSchema; let c_schema = unsafe { FFI_ArrowSchema::from_raw(c_schema_ptr) }; @@ -384,6 +385,7 @@ fn inner_create_with_ffi_schema<'local>( enable_stable_row_ids, data_storage_version, storage_options_obj, + s3_credentials_refresh_offset_seconds_obj, reader, ) } @@ -415,6 +417,7 @@ pub extern "system" fn Java_com_lancedb_lance_Dataset_createWithFfiStream<'local enable_stable_row_ids: JObject, // Optional data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map + s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw!( env, @@ -428,7 +431,8 @@ pub extern "system" fn 
Java_com_lancedb_lance_Dataset_createWithFfiStream<'local mode, enable_stable_row_ids, data_storage_version, - storage_options_obj + storage_options_obj, + s3_credentials_refresh_offset_seconds_obj ) ) } @@ -445,6 +449,7 @@ fn inner_create_with_ffi_stream<'local>( enable_stable_row_ids: JObject, // Optional data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map + s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let stream_ptr = arrow_array_stream_addr as *mut FFI_ArrowArrayStream; let reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }?; @@ -458,6 +463,7 @@ fn inner_create_with_ffi_stream<'local>( enable_stable_row_ids, data_storage_version, storage_options_obj, + s3_credentials_refresh_offset_seconds_obj, reader, ) } @@ -473,6 +479,7 @@ fn create_dataset<'local>( enable_stable_row_ids: JObject, data_storage_version: JObject, storage_options_obj: JObject, + s3_credentials_refresh_offset_seconds_obj: JObject, reader: impl RecordBatchReader + Send + 'static, ) -> Result> { let path_str = path.extract(env)?; @@ -489,6 +496,7 @@ fn create_dataset<'local>( &data_storage_version, &storage_options_obj, &empty_provider, + &s3_credentials_refresh_offset_seconds_obj, )?; let dataset = BlockingDataset::write(reader, &path_str, Some(write_params))?; diff --git a/java/lance-jni/src/fragment.rs b/java/lance-jni/src/fragment.rs index 731d4631e6c..9cee53a1108 100644 --- a/java/lance-jni/src/fragment.rs +++ b/java/lance-jni/src/fragment.rs @@ -91,6 +91,7 @@ pub extern "system" fn Java_com_lancedb_lance_Fragment_createWithFfiArray<'local data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional + s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw_with_return!( env, @@ -106,7 +107,8 @@ pub extern "system" fn Java_com_lancedb_lance_Fragment_createWithFfiArray<'local enable_stable_row_ids, 
data_storage_version, storage_options_obj, - storage_options_provider_obj + storage_options_provider_obj, + s3_credentials_refresh_offset_seconds_obj ), JObject::default() ) @@ -126,6 +128,7 @@ fn inner_create_with_ffi_array<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional + s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let c_array_ptr = arrow_array_addr as *mut FFI_ArrowArray; let c_schema_ptr = arrow_schema_addr as *mut FFI_ArrowSchema; @@ -151,6 +154,7 @@ fn inner_create_with_ffi_array<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, + s3_credentials_refresh_offset_seconds_obj, reader, ) } @@ -169,6 +173,7 @@ pub extern "system" fn Java_com_lancedb_lance_Fragment_createWithFfiStream<'a>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional + s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'a> { ok_or_throw_with_return!( env, @@ -183,7 +188,8 @@ pub extern "system" fn Java_com_lancedb_lance_Fragment_createWithFfiStream<'a>( enable_stable_row_ids, data_storage_version, storage_options_obj, - storage_options_provider_obj + storage_options_provider_obj, + s3_credentials_refresh_offset_seconds_obj ), JObject::null() ) @@ -202,6 +208,7 @@ fn inner_create_with_ffi_stream<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional + s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let stream_ptr = arrow_array_stream_addr as *mut FFI_ArrowArrayStream; let reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }?; @@ -217,6 +224,7 @@ fn inner_create_with_ffi_stream<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, + s3_credentials_refresh_offset_seconds_obj, reader, 
) } @@ -233,6 +241,7 @@ fn create_fragment<'a>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional + s3_credentials_refresh_offset_seconds_obj: JObject, // Optional source: impl StreamingWriteSource, ) -> Result> { let path_str = dataset_uri.extract(env)?; @@ -247,6 +256,7 @@ fn create_fragment<'a>( &data_storage_version, &storage_options_obj, &storage_options_provider_obj, + &s3_credentials_refresh_offset_seconds_obj, )?; let fragments = RT.block_on(FileFragment::create_fragments( diff --git a/java/lance-jni/src/transaction.rs b/java/lance-jni/src/transaction.rs index 8a2f2cc82a9..4f58274e311 100644 --- a/java/lance-jni/src/transaction.rs +++ b/java/lance-jni/src/transaction.rs @@ -679,7 +679,14 @@ fn inner_commit_transaction<'local>( .call_method(&java_transaction, "writeParams", "()Ljava/util/Map;", &[])? .l()?; let write_param_jmap = JMap::from_env(env, &write_param_jobj)?; - let write_param = to_rust_map(env, &write_param_jmap)?; + let mut write_param = to_rust_map(env, &write_param_jmap)?; + + // Extract s3_credentials_refresh_offset_seconds from write_param + let s3_credentials_refresh_offset = write_param + .remove("s3_credentials_refresh_offset_seconds") + .and_then(|v| v.parse::().ok()) + .map(std::time::Duration::from_secs) + .unwrap_or_else(|| std::time::Duration::from_secs(10)); // Get the Dataset's storage_options_provider let storage_options_provider = { @@ -692,6 +699,7 @@ fn inner_commit_transaction<'local>( let store_params = ObjectStoreParams { storage_options: Some(write_param), storage_options_provider, + s3_credentials_refresh_offset, ..Default::default() }; diff --git a/java/lance-jni/src/utils.rs b/java/lance-jni/src/utils.rs index 767cd4753af..50fc848313f 100644 --- a/java/lance-jni/src/utils.rs +++ b/java/lance-jni/src/utils.rs @@ -48,6 +48,7 @@ pub fn extract_write_params( data_storage_version: &JObject, storage_options_obj: &JObject, 
storage_options_provider_obj: &JObject, // Optional + s3_credentials_refresh_offset_seconds_obj: &JObject, // Optional ) -> Result { let mut write_params = WriteParams::default(); @@ -101,9 +102,16 @@ pub fn extract_write_params( let storage_options_provider_arc: Option> = storage_options_provider.map(|v| Arc::new(v) as Arc); + // Extract s3_credentials_refresh_offset_seconds if present + let s3_credentials_refresh_offset = env + .get_long_opt(s3_credentials_refresh_offset_seconds_obj)? + .map(|v| std::time::Duration::from_secs(v as u64)) + .unwrap_or_else(|| std::time::Duration::from_secs(10)); + write_params.store_params = Some(ObjectStoreParams { storage_options: Some(storage_options), storage_options_provider: storage_options_provider_arc, + s3_credentials_refresh_offset, ..Default::default() }); Ok(write_params) diff --git a/java/src/main/java/com/lancedb/lance/Dataset.java b/java/src/main/java/com/lancedb/lance/Dataset.java index f9574a832ad..b6df659b9f4 100644 --- a/java/src/main/java/com/lancedb/lance/Dataset.java +++ b/java/src/main/java/com/lancedb/lance/Dataset.java @@ -79,30 +79,33 @@ private Dataset() {} * Creates a builder for writing a dataset. * *

This builder supports writing datasets either directly to a URI or through a LanceNamespace. + * Data can be provided via reader() or stream() methods. * - *

Example usage with URI: + *

Example usage with URI and reader: * *

{@code
-   * Dataset dataset = Dataset.write(reader)
+   * Dataset dataset = Dataset.write()
+   *     .reader(myReader)
    *     .uri("s3://bucket/table.lance")
    *     .mode(WriteMode.CREATE)
    *     .execute();
    * }
* - *

Example usage with namespace: + *

Example usage with namespace and empty table: * *

{@code
-   * Dataset dataset = Dataset.write(reader)
-   *     .namespace(myNamespace, Arrays.asList("my_table"))
-   *     .mode(WriteMode.APPEND)
+   * Dataset dataset = Dataset.write()
+   *     .schema(mySchema)
+   *     .namespace(myNamespace)
+   *     .tableId(Arrays.asList("my_table"))
+   *     .mode(WriteMode.CREATE)
    *     .execute();
    * }
* - * @param reader ArrowReader containing the data to write * @return A new WriteDatasetBuilder instance */ - public static WriteDatasetBuilder write(ArrowReader reader) { - return new WriteDatasetBuilder(reader); + public static WriteDatasetBuilder write() { + return new WriteDatasetBuilder(); } /** @@ -113,8 +116,9 @@ public static WriteDatasetBuilder write(ArrowReader reader) { * @param schema dataset schema * @param params write params * @return Dataset - * @deprecated Use {@link #write(ArrowReader)} builder instead. For example: {@code - * Dataset.write(reader).uri(path).mode(WriteMode.CREATE).execute()} + * @deprecated Use {@link #write()} builder instead. For example: {@code + * Dataset.write().allocator(allocator).schema(schema).uri(path) + * .mode(WriteMode.CREATE).execute()} */ @Deprecated public static Dataset create( @@ -135,7 +139,8 @@ public static Dataset create( params.getMode(), params.getEnableStableRowIds(), params.getDataStorageVersion(), - params.getStorageOptions()); + params.getStorageOptions(), + params.getS3CredentialsRefreshOffsetSeconds()); dataset.allocator = allocator; return dataset; } @@ -149,8 +154,9 @@ public static Dataset create( * @param path dataset uri * @param params write parameters * @return Dataset - * @deprecated Use {@link #write(ArrowReader)} builder instead. For example: {@code - * Dataset.write(reader).uri(path).mode(WriteMode.CREATE).execute()} + * @deprecated Use {@link #write()} builder instead. 
For example: {@code + * Dataset.write().allocator(allocator).stream(stream).uri(path) + * .mode(WriteMode.CREATE).execute()} */ @Deprecated public static Dataset create( @@ -169,7 +175,8 @@ public static Dataset create( params.getMode(), params.getEnableStableRowIds(), params.getDataStorageVersion(), - params.getStorageOptions()); + params.getStorageOptions(), + params.getS3CredentialsRefreshOffsetSeconds()); dataset.allocator = allocator; return dataset; } @@ -183,7 +190,8 @@ private static native Dataset createWithFfiSchema( Optional mode, Optional enableStableRowIds, Optional dataStorageVersion, - Map storageOptions); + Map storageOptions, + Optional s3CredentialsRefreshOffsetSeconds); private static native Dataset createWithFfiStream( long arrowStreamMemoryAddress, @@ -194,14 +202,17 @@ private static native Dataset createWithFfiStream( Optional mode, Optional enableStableRowIds, Optional dataStorageVersion, - Map storageOptions); + Map storageOptions, + Optional s3CredentialsRefreshOffsetSeconds); /** * Open a dataset from the specified path. 
* * @param path file path * @return Dataset + * @deprecated Use {@link #open()} builder instead: {@code Dataset.open().uri(path).build()} */ + @Deprecated public static Dataset open(String path) { return open(new RootAllocator(Long.MAX_VALUE), true, path, new ReadOptions.Builder().build()); } @@ -212,7 +223,10 @@ public static Dataset open(String path) { * @param path file path * @param options the open options * @return Dataset + * @deprecated Use {@link #open()} builder instead: {@code + * Dataset.open().uri(path).readOptions(options).build()} */ + @Deprecated public static Dataset open(String path, ReadOptions options) { return open(new RootAllocator(Long.MAX_VALUE), true, path, options); } @@ -223,7 +237,10 @@ public static Dataset open(String path, ReadOptions options) { * @param path file path * @param allocator Arrow buffer allocator * @return Dataset + * @deprecated Use {@link #open()} builder instead: {@code + * Dataset.open().allocator(allocator).uri(path).build()} */ + @Deprecated public static Dataset open(String path, BufferAllocator allocator) { return open(allocator, path, new ReadOptions.Builder().build()); } @@ -235,7 +252,10 @@ public static Dataset open(String path, BufferAllocator allocator) { * @param path file path * @param options the open options * @return Dataset + * @deprecated Use {@link #open()} builder instead: {@code + * Dataset.open().allocator(allocator).uri(path).readOptions(options).build()} */ + @Deprecated public static Dataset open(BufferAllocator allocator, String path, ReadOptions options) { return open(allocator, false, path, options); } diff --git a/java/src/main/java/com/lancedb/lance/Fragment.java b/java/src/main/java/com/lancedb/lance/Fragment.java index 3864734f06e..d91a1d1d26a 100644 --- a/java/src/main/java/com/lancedb/lance/Fragment.java +++ b/java/src/main/java/com/lancedb/lance/Fragment.java @@ -247,7 +247,8 @@ public static List create( params.getEnableStableRowIds(), params.getDataStorageVersion(), 
params.getStorageOptions(), - Optional.ofNullable(storageOptionsProvider)); + Optional.ofNullable(storageOptionsProvider), + params.getS3CredentialsRefreshOffsetSeconds()); } } @@ -292,7 +293,8 @@ public static List create( params.getEnableStableRowIds(), params.getDataStorageVersion(), params.getStorageOptions(), - Optional.ofNullable(storageOptionsProvider)); + Optional.ofNullable(storageOptionsProvider), + params.getS3CredentialsRefreshOffsetSeconds()); } /** @@ -311,7 +313,8 @@ private static native List createWithFfiArray( Optional enableStableRowIds, Optional dataStorageVersion, Map storageOptions, - Optional storageOptionsProvider); + Optional storageOptionsProvider, + Optional s3CredentialsRefreshOffsetSeconds); /** * Create a fragment from the given arrow stream. @@ -328,5 +331,6 @@ private static native List createWithFfiStream( Optional enableStableRowIds, Optional dataStorageVersion, Map storageOptions, - Optional storageOptionsProvider); + Optional storageOptionsProvider, + Optional s3CredentialsRefreshOffsetSeconds); } diff --git a/java/src/main/java/com/lancedb/lance/Transaction.java b/java/src/main/java/com/lancedb/lance/Transaction.java index 057ca5466cf..f62b7018a86 100644 --- a/java/src/main/java/com/lancedb/lance/Transaction.java +++ b/java/src/main/java/com/lancedb/lance/Transaction.java @@ -118,6 +118,7 @@ public static class Builder { private Operation operation; private Map writeParams; private Map transactionProperties; + private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); public Builder(Dataset dataset) { this.dataset = dataset; @@ -139,6 +140,21 @@ public Builder writeParams(Map writeParams) { return this; } + /** + * Sets the S3 credentials refresh offset in seconds. + * + *

This parameter controls how long before credential expiration to refresh them. For + * example, if credentials expire at T+60s and this is set to 10, credentials will be refreshed + * at T+50s. + * + * @param s3CredentialsRefreshOffsetSeconds Refresh offset in seconds + * @return this builder instance + */ + public Builder s3CredentialsRefreshOffsetSeconds(long s3CredentialsRefreshOffsetSeconds) { + this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); + return this; + } + public Builder operation(Operation operation) { validateState(); this.operation = operation; @@ -154,8 +170,16 @@ private void validateState() { public Transaction build() { Preconditions.checkState(operation != null, "TransactionBuilder has no operations"); + + // Merge s3_credentials_refresh_offset_seconds into writeParams if present + Map finalWriteParams = + writeParams != null ? new HashMap<>(writeParams) : new HashMap<>(); + s3CredentialsRefreshOffsetSeconds.ifPresent( + value -> + finalWriteParams.put("s3_credentials_refresh_offset_seconds", String.valueOf(value))); + return new Transaction( - dataset, readVersion, uuid, operation, writeParams, transactionProperties); + dataset, readVersion, uuid, operation, finalWriteParams, transactionProperties); } } } diff --git a/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java b/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java index 76c67b6d1eb..6903bd00d9c 100644 --- a/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java +++ b/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java @@ -39,10 +39,11 @@ * URI or through a LanceNamespace. When using a namespace, the table location and storage options * are automatically managed with credential vending support. * - *

Example usage with URI: + *

Example usage with URI and reader: * *

{@code
- * Dataset dataset = Dataset.write(reader)
+ * Dataset dataset = Dataset.write()
+ *     .reader(myReader)
  *     .uri("s3://bucket/table.lance")
  *     .mode(WriteMode.CREATE)
  *     .execute();
@@ -51,7 +52,8 @@
  * 

Example usage with namespace: * *

{@code
- * Dataset dataset = Dataset.write(reader)
+ * Dataset dataset = Dataset.write()
+ *     .reader(myReader)
  *     .namespace(myNamespace)
  *     .tableId(Arrays.asList("my_table"))
  *     .mode(WriteMode.CREATE)
@@ -59,8 +61,9 @@
  * }
*/ public class WriteDatasetBuilder { - private final ArrowReader reader; private BufferAllocator allocator; + private ArrowReader reader; + private ArrowArrayStream stream; private String uri; private LanceNamespace namespace; private List tableId; @@ -73,25 +76,55 @@ public class WriteDatasetBuilder { private Optional maxBytesPerFile = Optional.empty(); private Optional enableStableRowIds = Optional.empty(); private Optional dataStorageVersion = Optional.empty(); + private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); /** Creates a new builder instance. Package-private, use Dataset.write() instead. */ - WriteDatasetBuilder(ArrowReader reader) { - Preconditions.checkNotNull(reader, "reader must not be null"); - this.reader = reader; + WriteDatasetBuilder() { + // allocator is optional and can be set via allocator() method } /** - * Sets the buffer allocator. + * Sets the buffer allocator to use for Arrow operations. + * + *

If not provided, a default RootAllocator will be created automatically. * - * @param allocator Arrow buffer allocator + * @param allocator The buffer allocator * @return this builder instance */ public WriteDatasetBuilder allocator(BufferAllocator allocator) { - Preconditions.checkNotNull(allocator); + Preconditions.checkNotNull(allocator, "allocator must not be null"); this.allocator = allocator; return this; } + /** + * Sets the ArrowReader containing the data to write. + * + *

Either reader() or stream() or schema() (for empty tables) must be provided. + * + * @param reader ArrowReader containing the data + * @return this builder instance + */ + public WriteDatasetBuilder reader(ArrowReader reader) { + Preconditions.checkNotNull(reader); + this.reader = reader; + return this; + } + + /** + * Sets the ArrowArrayStream containing the data to write. + * + *

Either reader() or stream() or schema() (for empty tables) must be provided. + * + * @param stream ArrowArrayStream containing the data + * @return this builder instance + */ + public WriteDatasetBuilder stream(ArrowArrayStream stream) { + Preconditions.checkNotNull(stream); + this.stream = stream; + return this; + } + /** * Sets the dataset URI. * @@ -237,6 +270,21 @@ public WriteDatasetBuilder dataStorageVersion(WriteParams.LanceFileVersion dataS return this; } + /** + * Sets the S3 credentials refresh offset in seconds. + * + *

This parameter controls how long before credential expiration to refresh them. For example, + * if credentials expire at T+60s and this is set to 10, credentials will be refreshed at T+50s. + * + * @param s3CredentialsRefreshOffsetSeconds Refresh offset in seconds + * @return this builder instance + */ + public WriteDatasetBuilder s3CredentialsRefreshOffsetSeconds( + long s3CredentialsRefreshOffsetSeconds) { + this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); + return this; + } + /** * Executes the write operation and returns the created dataset. * @@ -247,6 +295,11 @@ public WriteDatasetBuilder dataStorageVersion(WriteParams.LanceFileVersion dataS * @throws IllegalArgumentException if required parameters are missing or invalid */ public Dataset execute() { + // Auto-create allocator if not provided + if (allocator == null) { + allocator = new RootAllocator(Long.MAX_VALUE); + } + // Validate that exactly one of uri or namespace is provided boolean hasUri = uri != null; boolean hasNamespace = namespace != null && tableId != null; @@ -267,9 +320,20 @@ public Dataset execute() { } } - // Create allocator if not provided - if (allocator == null) { - allocator = new RootAllocator(Long.MAX_VALUE); + // Validate data source - exactly one of reader, stream, or schema must be provided + int dataSourceCount = 0; + if (reader != null) dataSourceCount++; + if (stream != null) dataSourceCount++; + if (schema != null && reader == null && stream == null) dataSourceCount++; + + if (dataSourceCount == 0) { + throw new IllegalArgumentException( + "Must provide data via reader(), stream(), or schema() (for empty tables)."); + } + if (dataSourceCount > 1) { + throw new IllegalArgumentException( + "Cannot specify multiple data sources. 
" + + "Use only one of: reader(), stream(), or schema()."); } // Handle namespace-based writing @@ -329,6 +393,8 @@ private Dataset executeWithNamespace() { maxBytesPerFile.ifPresent(paramsBuilder::withMaxBytesPerFile); enableStableRowIds.ifPresent(paramsBuilder::withEnableStableRowIds); dataStorageVersion.ifPresent(paramsBuilder::withDataStorageVersion); + s3CredentialsRefreshOffsetSeconds.ifPresent( + paramsBuilder::withS3CredentialsRefreshOffsetSeconds); WriteParams params = paramsBuilder.build(); @@ -345,6 +411,8 @@ private Dataset executeWithUri() { maxBytesPerFile.ifPresent(paramsBuilder::withMaxBytesPerFile); enableStableRowIds.ifPresent(paramsBuilder::withEnableStableRowIds); dataStorageVersion.ifPresent(paramsBuilder::withDataStorageVersion); + s3CredentialsRefreshOffsetSeconds.ifPresent( + paramsBuilder::withS3CredentialsRefreshOffsetSeconds); WriteParams params = paramsBuilder.build(); @@ -352,9 +420,24 @@ private Dataset executeWithUri() { } private Dataset createDatasetWithStream(String path, WriteParams params) { - try (ArrowArrayStream stream = ArrowArrayStream.allocateNew(allocator)) { - Data.exportArrayStream(allocator, reader, stream); + // If stream is directly provided, use it + if (stream != null) { return Dataset.create(allocator, stream, path, params); } + + // If reader is provided, convert to stream + if (reader != null) { + try (ArrowArrayStream tempStream = ArrowArrayStream.allocateNew(allocator)) { + Data.exportArrayStream(allocator, reader, tempStream); + return Dataset.create(allocator, tempStream, path, params); + } + } + + // If only schema is provided (empty table), use Dataset.create with schema + if (schema != null) { + return Dataset.create(allocator, path, schema, params); + } + + throw new IllegalStateException("No data source provided"); } } diff --git a/java/src/main/java/com/lancedb/lance/WriteParams.java b/java/src/main/java/com/lancedb/lance/WriteParams.java index 85884042a05..a49cc57c3a0 100644 --- 
a/java/src/main/java/com/lancedb/lance/WriteParams.java +++ b/java/src/main/java/com/lancedb/lance/WriteParams.java @@ -56,6 +56,7 @@ public String getVersionString() { private final Optional enableStableRowIds; private final Optional dataStorageVersion; private Map storageOptions = new HashMap<>(); + private final Optional s3CredentialsRefreshOffsetSeconds; private WriteParams( Optional maxRowsPerFile, @@ -64,7 +65,8 @@ private WriteParams( Optional mode, Optional enableStableRowIds, Optional dataStorageVersion, - Map storageOptions) { + Map storageOptions, + Optional s3CredentialsRefreshOffsetSeconds) { this.maxRowsPerFile = maxRowsPerFile; this.maxRowsPerGroup = maxRowsPerGroup; this.maxBytesPerFile = maxBytesPerFile; @@ -72,6 +74,7 @@ private WriteParams( this.enableStableRowIds = enableStableRowIds; this.dataStorageVersion = dataStorageVersion; this.storageOptions = storageOptions; + this.s3CredentialsRefreshOffsetSeconds = s3CredentialsRefreshOffsetSeconds; } public Optional getMaxRowsPerFile() { @@ -107,6 +110,10 @@ public Map getStorageOptions() { return storageOptions; } + public Optional getS3CredentialsRefreshOffsetSeconds() { + return s3CredentialsRefreshOffsetSeconds; + } + @Override public String toString() { return MoreObjects.toStringHelper(this) @@ -127,6 +134,7 @@ public static class Builder { private Optional enableStableRowIds = Optional.empty(); private Optional dataStorageVersion = Optional.empty(); private Map storageOptions = new HashMap<>(); + private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); public Builder withMaxRowsPerFile(int maxRowsPerFile) { this.maxRowsPerFile = Optional.of(maxRowsPerFile); @@ -163,6 +171,11 @@ public Builder withEnableStableRowIds(boolean enableStableRowIds) { return this; } + public Builder withS3CredentialsRefreshOffsetSeconds(long s3CredentialsRefreshOffsetSeconds) { + this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); + return this; + } + public 
WriteParams build() { return new WriteParams( maxRowsPerFile, @@ -171,7 +184,8 @@ public WriteParams build() { mode, enableStableRowIds, dataStorageVersion, - storageOptions); + storageOptions, + s3CredentialsRefreshOffsetSeconds); } } } diff --git a/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java b/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java index bb64ef795a1..538b6d78199 100644 --- a/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java +++ b/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java @@ -504,11 +504,12 @@ public VectorSchemaRoot getVectorSchemaRoot() { // Use the write builder to create a dataset through namespace try (Dataset dataset = - Dataset.write(testReader) + Dataset.write() + .allocator(allocator) + .reader(testReader) .namespace(namespace) .tableId(Arrays.asList(tableName)) .mode(WriteParams.WriteMode.CREATE) - .allocator(allocator) .execute()) { // Verify createEmptyTable was called @@ -617,11 +618,12 @@ public VectorSchemaRoot getVectorSchemaRoot() { // Use the write builder to append to dataset through namespace try (Dataset dataset = - Dataset.write(testReader) + Dataset.write() + .allocator(allocator) + .reader(testReader) .namespace(namespace) .tableId(Arrays.asList(tableName)) .mode(WriteParams.WriteMode.APPEND) - .allocator(allocator) .execute()) { // Verify describeTable was called From 6d8438cd02e6d7332af4547c4db2d7e030479fa3 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 20 Nov 2025 22:56:03 -0800 Subject: [PATCH 05/12] use tracking ns --- .../lance/NamespaceIntegrationTest.java | 473 +++++++++++------- 1 file changed, 288 insertions(+), 185 deletions(-) diff --git a/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java b/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java index 538b6d78199..18f456e338d 100644 --- a/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java +++ 
b/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java @@ -13,7 +13,10 @@ */ package com.lancedb.lance; +import com.lancedb.lance.namespace.DirectoryNamespace; import com.lancedb.lance.namespace.LanceNamespace; +import com.lancedb.lance.namespace.model.CreateEmptyTableRequest; +import com.lancedb.lance.namespace.model.CreateEmptyTableResponse; import com.lancedb.lance.namespace.model.DescribeTableRequest; import com.lancedb.lance.namespace.model.DescribeTableResponse; @@ -21,6 +24,7 @@ import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.ipc.ArrowReader; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; @@ -44,7 +48,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Optional; import java.util.UUID; import java.util.concurrent.atomic.AtomicInteger; @@ -53,7 +56,7 @@ /** * Integration tests for Lance with S3 and credential refresh using StorageOptionsProvider. * - *

This test simulates a mock credential provider that returns incrementing credentials and + *

This test simulates a tracking credential provider that returns incrementing credentials and * verifies that the credential refresh mechanism works correctly. * *

These tests require LocalStack to be running. Run with: docker compose up -d @@ -120,85 +123,100 @@ private static void deleteBucket() { } /** - * Mock LanceNamespace implementation for testing. + * Tracking LanceNamespace implementation for testing. * - *

This implementation: - Returns table location and storage options via describeTable() - - * Tracks the number of times describeTable has been called - Returns credentials with short - * expiration times for testing refresh + *

This implementation wraps DirectoryNamespace and tracks API calls. It returns incrementing + * credentials with expiration timestamps to test the credential refresh mechanism. */ - static class MockLanceNamespace implements LanceNamespace { - private final Map tableLocations = new HashMap<>(); + static class TrackingNamespace implements LanceNamespace { + private final String bucketName; private final Map baseStorageOptions; private final int credentialExpiresInSeconds; - private final AtomicInteger callCount = new AtomicInteger(0); + private final AtomicInteger describeCallCount = new AtomicInteger(0); + private final AtomicInteger createCallCount = new AtomicInteger(0); + private final DirectoryNamespace inner; - public MockLanceNamespace(Map storageOptions, int credentialExpiresInSeconds) { + public TrackingNamespace( + String bucketName, Map storageOptions, int credentialExpiresInSeconds) { + this.bucketName = bucketName; this.baseStorageOptions = new HashMap<>(storageOptions); this.credentialExpiresInSeconds = credentialExpiresInSeconds; - } - @Override - public void initialize(Map configProperties, BufferAllocator allocator) { - // Not needed for test + // Create underlying DirectoryNamespace with storage options + Map dirProps = new HashMap<>(); + for (Map.Entry entry : storageOptions.entrySet()) { + dirProps.put("storage." 
+ entry.getKey(), entry.getValue()); + } + + // Set root based on bucket type + if (bucketName.startsWith("/") || bucketName.startsWith("file://")) { + dirProps.put("root", bucketName + "/namespace_root"); + } else { + dirProps.put("root", "s3://" + bucketName + "/namespace_root"); + } + + this.inner = new DirectoryNamespace(); + try (BufferAllocator allocator = new RootAllocator()) { + this.inner.initialize(dirProps, allocator); + } } - public void registerTable(String tableName, String location) { - tableLocations.put(tableName, location); + public int getDescribeCallCount() { + return describeCallCount.get(); } - public int getCallCount() { - return callCount.get(); + public int getCreateCallCount() { + return createCallCount.get(); } @Override - public String namespaceId() { - return "MockLanceNamespace { }"; + public void initialize(Map configProperties, BufferAllocator allocator) { + // Already initialized in constructor } @Override - public com.lancedb.lance.namespace.model.CreateEmptyTableResponse createEmptyTable( - com.lancedb.lance.namespace.model.CreateEmptyTableRequest request) { - callCount.incrementAndGet(); + public String namespaceId() { + return "TrackingNamespace { inner: " + inner.namespaceId() + " }"; + } - String tableName = String.join("/", request.getId()); - // Generate a table URI for the new table - String tableUri = "s3://" + BUCKET_NAME + "/" + tableName + ".lance"; - tableLocations.put(tableName, tableUri); + /** + * Modifies storage options to add incrementing credentials with expiration timestamp. + * + * @param storageOptions Original storage options + * @param count Call count to use for credential generation + * @return Modified storage options with new credentials + */ + private Map modifyStorageOptions( + Map storageOptions, int count) { + Map modified = + storageOptions != null ? 
new HashMap<>(storageOptions) : new HashMap<>(); + + modified.put("aws_access_key_id", "AKID_" + count); + modified.put("aws_secret_access_key", "SECRET_" + count); + modified.put("aws_session_token", "TOKEN_" + count); - // Create storage options with expiration - Map storageOptions = new HashMap<>(baseStorageOptions); long expiresAtMillis = System.currentTimeMillis() + (credentialExpiresInSeconds * 1000L); - storageOptions.put("expires_at_millis", String.valueOf(expiresAtMillis)); + modified.put("expires_at_millis", String.valueOf(expiresAtMillis)); - com.lancedb.lance.namespace.model.CreateEmptyTableResponse response = - new com.lancedb.lance.namespace.model.CreateEmptyTableResponse(); - response.setLocation(tableUri); - response.setStorageOptions(storageOptions); + return modified; + } + + @Override + public CreateEmptyTableResponse createEmptyTable(CreateEmptyTableRequest request) { + int count = createCallCount.incrementAndGet(); + + CreateEmptyTableResponse response = inner.createEmptyTable(request); + response.setStorageOptions(modifyStorageOptions(response.getStorageOptions(), count)); return response; } @Override public DescribeTableResponse describeTable(DescribeTableRequest request) { - int count = callCount.incrementAndGet(); + int count = describeCallCount.incrementAndGet(); - String tableName = String.join("/", request.getId()); - String location = tableLocations.get(tableName); - if (location == null) { - throw new IllegalArgumentException("Table not found: " + tableName); - } - - // Create storage options with expiration - Map storageOptions = new HashMap<>(baseStorageOptions); - long expiresAtMillis = System.currentTimeMillis() + (credentialExpiresInSeconds * 1000L); - storageOptions.put("expires_at_millis", String.valueOf(expiresAtMillis)); - - DescribeTableResponse response = new DescribeTableResponse(); - response.setLocation(location); - response.setStorageOptions(storageOptions); - if (request.getVersion() != null) { - 
response.setVersion(request.getVersion()); - } + DescribeTableResponse response = inner.describeTable(request); + response.setStorageOptions(modifyStorageOptions(response.getStorageOptions(), count)); return response; } @@ -207,10 +225,7 @@ public DescribeTableResponse describeTable(DescribeTableRequest request) { @Test void testOpenDatasetWithoutRefresh() throws Exception { try (BufferAllocator allocator = new RootAllocator()) { - // Create test dataset directly on S3 - String tableName = UUID.randomUUID().toString(); - String tableUri = "s3://" + BUCKET_NAME + "/" + tableName + ".lance"; - + // Set up storage options Map storageOptions = new HashMap<>(); storageOptions.put("allow_http", "true"); storageOptions.put("aws_access_key_id", ACCESS_KEY); @@ -218,7 +233,11 @@ void testOpenDatasetWithoutRefresh() throws Exception { storageOptions.put("aws_endpoint", ENDPOINT_URL); storageOptions.put("aws_region", REGION); - // Create schema and write dataset + // Create tracking namespace with 60-second expiration (long enough to not expire during test) + TrackingNamespace namespace = new TrackingNamespace(BUCKET_NAME, storageOptions, 60); + String tableName = UUID.randomUUID().toString(); + + // Create schema and data Schema schema = new Schema( Arrays.asList( @@ -241,26 +260,54 @@ void testOpenDatasetWithoutRefresh() throws Exception { bVector.setValueCount(2); root.setRowCount(2); - WriteParams writeParams = - new WriteParams.Builder().withStorageOptions(storageOptions).build(); - - // Create dataset using Dataset.create - try (Dataset dataset = Dataset.create(allocator, tableUri, schema, writeParams)) { - // Add data via fragments - List fragments = - Fragment.create(tableUri, allocator, root, writeParams); - FragmentOperation.Append appendOp = new FragmentOperation.Append(fragments); - try (Dataset updatedDataset = - Dataset.commit(allocator, tableUri, appendOp, Optional.of(1L), storageOptions)) { - assertEquals(2, updatedDataset.version()); - assertEquals(2, 
updatedDataset.countRows()); - } + // Create a test reader that returns our VectorSchemaRoot + ArrowReader testReader = + new ArrowReader(allocator) { + boolean firstRead = true; + + @Override + public boolean loadNextBatch() { + if (firstRead) { + firstRead = false; + return true; + } + return false; + } + + @Override + public long bytesRead() { + return 0; + } + + @Override + protected void closeReadSource() {} + + @Override + protected Schema readSchema() { + return schema; + } + + @Override + public VectorSchemaRoot getVectorSchemaRoot() { + return root; + } + }; + + // Create dataset through namespace + try (Dataset dataset = + Dataset.write() + .allocator(allocator) + .reader(testReader) + .namespace(namespace) + .tableId(Arrays.asList(tableName)) + .mode(WriteParams.WriteMode.CREATE) + .execute()) { + assertEquals(2, dataset.countRows()); } } - // Create mock namespace with 60-second expiration (long enough to not expire during test) - MockLanceNamespace namespace = new MockLanceNamespace(storageOptions, 60); - namespace.registerTable(tableName, tableUri); + // Verify createEmptyTable was called + assertEquals(1, namespace.getCreateCallCount(), "createEmptyTable should be called once"); // Open dataset through namespace WITH refresh enabled // Use 10-second refresh offset, so credentials effectively expire at T+50s @@ -269,7 +316,7 @@ void testOpenDatasetWithoutRefresh() throws Exception { .setS3CredentialsRefreshOffsetSeconds(10) // Refresh 10s before expiration .build(); - int callCountBeforeOpen = namespace.getCallCount(); + int callCountBeforeOpen = namespace.getDescribeCallCount(); try (Dataset dsFromNamespace = Dataset.open() .allocator(allocator) @@ -279,7 +326,7 @@ void testOpenDatasetWithoutRefresh() throws Exception { .build()) { // With the fix, describeTable should only be called once during open // to get the table location and initial storage options - int callCountAfterOpen = namespace.getCallCount(); + int callCountAfterOpen = 
namespace.getDescribeCallCount(); assertEquals( 1, callCountAfterOpen - callCountBeforeOpen, @@ -295,10 +342,10 @@ void testOpenDatasetWithoutRefresh() throws Exception { List fragments = dsFromNamespace.getFragments(); assertEquals(1, fragments.size()); List versions = dsFromNamespace.listVersions(); - assertEquals(2, versions.size()); + assertEquals(1, versions.size()); // With the fix, credentials are cached so no additional calls are made - int finalCallCount = namespace.getCallCount(); + int finalCallCount = namespace.getDescribeCallCount(); int totalCalls = finalCallCount - callCountBeforeOpen; assertEquals( 1, @@ -312,10 +359,7 @@ void testOpenDatasetWithoutRefresh() throws Exception { @Test void testStorageOptionsProviderWithRefresh() throws Exception { try (BufferAllocator allocator = new RootAllocator()) { - // Create test dataset - String tableName = UUID.randomUUID().toString(); - String tableUri = "s3://" + BUCKET_NAME + "/" + tableName + ".lance"; - + // Set up storage options Map storageOptions = new HashMap<>(); storageOptions.put("allow_http", "true"); storageOptions.put("aws_access_key_id", ACCESS_KEY); @@ -323,7 +367,11 @@ void testStorageOptionsProviderWithRefresh() throws Exception { storageOptions.put("aws_endpoint", ENDPOINT_URL); storageOptions.put("aws_region", REGION); - // Create schema and write dataset + // Create tracking namespace with 5-second expiration for faster testing + TrackingNamespace namespace = new TrackingNamespace(BUCKET_NAME, storageOptions, 5); + String tableName = UUID.randomUUID().toString(); + + // Create schema and data Schema schema = new Schema( Arrays.asList( @@ -346,25 +394,55 @@ void testStorageOptionsProviderWithRefresh() throws Exception { bVector.setValueCount(2); root.setRowCount(2); - WriteParams writeParams = - new WriteParams.Builder().withStorageOptions(storageOptions).build(); - - // Create dataset using Dataset.create - try (Dataset dataset = Dataset.create(allocator, tableUri, schema, writeParams)) { 
- // Add data via fragments - List fragments = - Fragment.create(tableUri, allocator, root, writeParams); - FragmentOperation.Append appendOp = new FragmentOperation.Append(fragments); - try (Dataset updatedDataset = - Dataset.commit(allocator, tableUri, appendOp, Optional.of(1L), storageOptions)) { - assertEquals(2, updatedDataset.countRows()); - } + // Create a test reader that returns our VectorSchemaRoot + ArrowReader testReader = + new ArrowReader(allocator) { + boolean firstRead = true; + + @Override + public boolean loadNextBatch() { + if (firstRead) { + firstRead = false; + return true; + } + return false; + } + + @Override + public long bytesRead() { + return 0; + } + + @Override + protected void closeReadSource() {} + + @Override + protected Schema readSchema() { + return schema; + } + + @Override + public VectorSchemaRoot getVectorSchemaRoot() { + return root; + } + }; + + // Create dataset through namespace with refresh enabled + try (Dataset dataset = + Dataset.write() + .allocator(allocator) + .reader(testReader) + .namespace(namespace) + .tableId(Arrays.asList(tableName)) + .mode(WriteParams.WriteMode.CREATE) + .s3CredentialsRefreshOffsetSeconds(2) // Refresh 2s before expiration + .execute()) { + assertEquals(2, dataset.countRows()); } } - // Create mock namespace with 5-second expiration for faster testing - MockLanceNamespace namespace = new MockLanceNamespace(storageOptions, 5); - namespace.registerTable(tableName, tableUri); + // Verify createEmptyTable was called + assertEquals(1, namespace.getCreateCallCount(), "createEmptyTable should be called once"); // Open dataset through namespace with refresh enabled // Use 2-second refresh offset so credentials effectively expire at T+3s (5s - 2s) @@ -373,7 +451,7 @@ void testStorageOptionsProviderWithRefresh() throws Exception { .setS3CredentialsRefreshOffsetSeconds(2) // Refresh 2s before expiration .build(); - int callCountBeforeOpen = namespace.getCallCount(); + int callCountBeforeOpen = 
namespace.getDescribeCallCount(); try (Dataset dsFromNamespace = Dataset.open() .allocator(allocator) @@ -382,7 +460,7 @@ void testStorageOptionsProviderWithRefresh() throws Exception { .readOptions(readOptions) .build()) { // With the fix, describeTable should only be called once during open - int callCountAfterOpen = namespace.getCallCount(); + int callCountAfterOpen = namespace.getDescribeCallCount(); assertEquals( 1, callCountAfterOpen - callCountBeforeOpen, @@ -393,7 +471,7 @@ void testStorageOptionsProviderWithRefresh() throws Exception { assertEquals(2, dsFromNamespace.countRows()); // Record call count after initial reads - int callCountAfterInitialReads = namespace.getCallCount(); + int callCountAfterInitialReads = namespace.getDescribeCallCount(); int callsAfterFirstRead = callCountAfterInitialReads - callCountBeforeOpen; assertEquals( 1, @@ -410,9 +488,9 @@ void testStorageOptionsProviderWithRefresh() throws Exception { List fragments = dsFromNamespace.getFragments(); assertEquals(1, fragments.size()); List versions = dsFromNamespace.listVersions(); - assertEquals(2, versions.size()); + assertEquals(1, versions.size()); - int finalCallCount = namespace.getCallCount(); + int finalCallCount = namespace.getDescribeCallCount(); int totalCallsAfterExpiration = finalCallCount - callCountBeforeOpen; assertEquals( 2, @@ -435,8 +513,8 @@ void testWriteDatasetBuilderWithNamespaceCreate() throws Exception { storageOptions.put("aws_endpoint", ENDPOINT_URL); storageOptions.put("aws_region", REGION); - // Create mock namespace - MockLanceNamespace namespace = new MockLanceNamespace(storageOptions, 60); + // Create tracking namespace + TrackingNamespace namespace = new TrackingNamespace(BUCKET_NAME, storageOptions, 60); String tableName = UUID.randomUUID().toString(); // Create schema and data @@ -462,65 +540,59 @@ void testWriteDatasetBuilderWithNamespaceCreate() throws Exception { bVector.setValueCount(2); root.setRowCount(2); - // Use the write builder with 
namespace - try (org.apache.arrow.vector.ipc.ArrowReader reader = - new org.apache.arrow.vector.ipc.ArrowStreamReader( - new java.io.ByteArrayInputStream(new byte[0]), allocator)) { - - // Create a test reader that returns our VectorSchemaRoot - org.apache.arrow.vector.ipc.ArrowReader testReader = - new org.apache.arrow.vector.ipc.ArrowReader(allocator) { - boolean firstRead = true; - - @Override - public boolean loadNextBatch() { - if (firstRead) { - firstRead = false; - return true; - } - return false; - } + // Create a test reader that returns our VectorSchemaRoot + ArrowReader testReader = + new ArrowReader(allocator) { + boolean firstRead = true; - @Override - public long bytesRead() { - return 0; + @Override + public boolean loadNextBatch() { + if (firstRead) { + firstRead = false; + return true; } + return false; + } - @Override - protected void closeReadSource() {} + @Override + public long bytesRead() { + return 0; + } - @Override - protected org.apache.arrow.vector.types.pojo.Schema readSchema() { - return schema; - } + @Override + protected void closeReadSource() {} - @Override - public VectorSchemaRoot getVectorSchemaRoot() { - return root; - } - }; - - int callCountBefore = namespace.getCallCount(); - - // Use the write builder to create a dataset through namespace - try (Dataset dataset = - Dataset.write() - .allocator(allocator) - .reader(testReader) - .namespace(namespace) - .tableId(Arrays.asList(tableName)) - .mode(WriteParams.WriteMode.CREATE) - .execute()) { - - // Verify createEmptyTable was called - int callCountAfter = namespace.getCallCount(); - assertEquals( - 1, callCountAfter - callCountBefore, "createEmptyTable should be called once"); - - // Verify dataset was created successfully - assertEquals(2, dataset.countRows()); - assertEquals(schema, dataset.getSchema()); - } + @Override + protected Schema readSchema() { + return schema; + } + + @Override + public VectorSchemaRoot getVectorSchemaRoot() { + return root; + } + }; + + int 
callCountBefore = namespace.getCreateCallCount(); + + // Use the write builder to create a dataset through namespace + try (Dataset dataset = + Dataset.write() + .allocator(allocator) + .reader(testReader) + .namespace(namespace) + .tableId(Arrays.asList(tableName)) + .mode(WriteParams.WriteMode.CREATE) + .execute()) { + + // Verify createEmptyTable was called + int callCountAfter = namespace.getCreateCallCount(); + assertEquals( + 1, callCountAfter - callCountBefore, "createEmptyTable should be called once"); + + // Verify dataset was created successfully + assertEquals(2, dataset.countRows()); + assertEquals(schema, dataset.getSchema()); } } } @@ -537,9 +609,9 @@ void testWriteDatasetBuilderWithNamespaceAppend() throws Exception { storageOptions.put("aws_endpoint", ENDPOINT_URL); storageOptions.put("aws_region", REGION); - // Create initial dataset directly + // Create tracking namespace + TrackingNamespace namespace = new TrackingNamespace(BUCKET_NAME, storageOptions, 60); String tableName = UUID.randomUUID().toString(); - String tableUri = "s3://" + BUCKET_NAME + "/" + tableName + ".lance"; Schema schema = new Schema( @@ -563,27 +635,57 @@ void testWriteDatasetBuilderWithNamespaceAppend() throws Exception { bVector.setValueCount(2); root.setRowCount(2); - WriteParams writeParams = - new WriteParams.Builder().withStorageOptions(storageOptions).build(); - - // Create initial dataset - try (Dataset dataset = Dataset.create(allocator, tableUri, schema, writeParams)) { - List fragments = - Fragment.create(tableUri, allocator, root, writeParams); - FragmentOperation.Append appendOp = new FragmentOperation.Append(fragments); - try (Dataset updatedDataset = - Dataset.commit(allocator, tableUri, appendOp, Optional.of(1L), storageOptions)) { - assertEquals(2, updatedDataset.countRows()); - } + // Create a test reader that returns our VectorSchemaRoot + ArrowReader testReader = + new ArrowReader(allocator) { + boolean firstRead = true; + + @Override + public boolean 
loadNextBatch() { + if (firstRead) { + firstRead = false; + return true; + } + return false; + } + + @Override + public long bytesRead() { + return 0; + } + + @Override + protected void closeReadSource() {} + + @Override + protected Schema readSchema() { + return schema; + } + + @Override + public VectorSchemaRoot getVectorSchemaRoot() { + return root; + } + }; + + // Create initial dataset through namespace + try (Dataset dataset = + Dataset.write() + .allocator(allocator) + .reader(testReader) + .namespace(namespace) + .tableId(Arrays.asList(tableName)) + .mode(WriteParams.WriteMode.CREATE) + .execute()) { + assertEquals(2, dataset.countRows()); } - // Create mock namespace and register the table - MockLanceNamespace namespace = new MockLanceNamespace(storageOptions, 60); - namespace.registerTable(tableName, tableUri); + assertEquals(1, namespace.getCreateCallCount(), "createEmptyTable should be called once"); + int initialDescribeCount = namespace.getDescribeCallCount(); // Now append data using the write builder with namespace - org.apache.arrow.vector.ipc.ArrowReader testReader = - new org.apache.arrow.vector.ipc.ArrowReader(allocator) { + ArrowReader appendReader = + new ArrowReader(allocator) { boolean firstRead = true; @Override @@ -604,7 +706,7 @@ public long bytesRead() { protected void closeReadSource() {} @Override - protected org.apache.arrow.vector.types.pojo.Schema readSchema() { + protected Schema readSchema() { return schema; } @@ -614,21 +716,22 @@ public VectorSchemaRoot getVectorSchemaRoot() { } }; - int callCountBefore = namespace.getCallCount(); - // Use the write builder to append to dataset through namespace try (Dataset dataset = Dataset.write() .allocator(allocator) - .reader(testReader) + .reader(appendReader) .namespace(namespace) .tableId(Arrays.asList(tableName)) .mode(WriteParams.WriteMode.APPEND) .execute()) { // Verify describeTable was called - int callCountAfter = namespace.getCallCount(); - assertEquals(1, callCountAfter - 
callCountBefore, "describeTable should be called once"); + int callCountAfter = namespace.getDescribeCallCount(); + assertEquals( + 1, + callCountAfter - initialDescribeCount, + "describeTable should be called once for append"); // Verify data was appended successfully assertEquals(4, dataset.countRows()); // Original 2 + appended 2 From 2df89173214d614dfcd9dec1f9eb343446607dcb Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 20 Nov 2025 23:05:56 -0800 Subject: [PATCH 06/12] move fragment also to builder --- .../main/java/com/lancedb/lance/Fragment.java | 35 +++ .../lancedb/lance/WriteFragmentBuilder.java | 272 ++++++++++++++++++ 2 files changed, 307 insertions(+) create mode 100644 java/src/main/java/com/lancedb/lance/WriteFragmentBuilder.java diff --git a/java/src/main/java/com/lancedb/lance/Fragment.java b/java/src/main/java/com/lancedb/lance/Fragment.java index d91a1d1d26a..6077c68a324 100644 --- a/java/src/main/java/com/lancedb/lance/Fragment.java +++ b/java/src/main/java/com/lancedb/lance/Fragment.java @@ -198,6 +198,27 @@ private native FragmentUpdateResult nativeUpdateColumns( String leftOn, String rightOn); + /** + * Create a new fragment writer builder. + * + *

Example usage: + * + *

{@code
+   * List fragments = Fragment.write()
+   *     .datasetUri("s3://bucket/dataset.lance")
+   *     .allocator(allocator)
+   *     .data(vectorSchemaRoot)
+   *     .storageOptions(storageOptions)
+   *     .s3CredentialsRefreshOffsetSeconds(10)
+   *     .execute();
+   * }
+ * + * @return a new fragment writer builder + */ + public static WriteFragmentBuilder write() { + return new WriteFragmentBuilder(); + } + /** * Create a fragment from the given data. * @@ -206,7 +227,10 @@ private native FragmentUpdateResult nativeUpdateColumns( * @param root the vector schema root * @param params the write params * @return the fragment metadata + * @deprecated Use {@link #write()} builder instead. For example: {@code Fragment.write() + * .datasetUri(uri).allocator(allocator).data(root).writeParams(params).execute()} */ + @Deprecated public static List create( String datasetUri, BufferAllocator allocator, VectorSchemaRoot root, WriteParams params) { return create(datasetUri, allocator, root, params, null); @@ -222,7 +246,11 @@ public static List create( * @param storageOptionsProvider optional provider for dynamic storage options with automatic * credential refresh * @return the fragment metadata + * @deprecated Use {@link #write()} builder instead. For example: {@code Fragment.write() + * .datasetUri(uri).allocator(allocator).data(root).writeParams(params) + * .storageOptionsProvider(provider).execute()} */ + @Deprecated public static List create( String datasetUri, BufferAllocator allocator, @@ -259,7 +287,10 @@ public static List create( * @param stream the arrow stream * @param params the write params * @return the fragment metadata + * @deprecated Use {@link #write()} builder instead. For example: {@code Fragment.write() + * .datasetUri(uri).data(stream).writeParams(params).execute()} */ + @Deprecated public static List create( String datasetUri, ArrowArrayStream stream, WriteParams params) { return create(datasetUri, stream, params, null); @@ -274,7 +305,11 @@ public static List create( * @param storageOptionsProvider optional provider for dynamic storage options with automatic * credential refresh * @return the fragment metadata + * @deprecated Use {@link #write()} builder instead. 
For example: {@code + * Fragment.write().datasetUri(uri).data(stream).writeParams(params) + * .storageOptionsProvider(provider).execute()} */ + @Deprecated public static List create( String datasetUri, ArrowArrayStream stream, diff --git a/java/src/main/java/com/lancedb/lance/WriteFragmentBuilder.java b/java/src/main/java/com/lancedb/lance/WriteFragmentBuilder.java new file mode 100644 index 00000000000..d504d0a9373 --- /dev/null +++ b/java/src/main/java/com/lancedb/lance/WriteFragmentBuilder.java @@ -0,0 +1,272 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.lancedb.lance; + +import com.lancedb.lance.io.StorageOptionsProvider; + +import org.apache.arrow.c.ArrowArrayStream; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.VectorSchemaRoot; + +import java.util.List; +import java.util.Map; + +/** + * Builder for writing fragments. + * + *

This builder provides a fluent API for creating fragments with various configuration options. + * It supports both VectorSchemaRoot and ArrowArrayStream as data sources. + * + *

Example usage: + * + *

{@code
+ * List fragments = Fragment.write()
+ *     .datasetUri("s3://bucket/dataset.lance")
+ *     .allocator(allocator)
+ *     .data(vectorSchemaRoot)
+ *     .storageOptions(storageOptions)
+ *     .s3CredentialsRefreshOffsetSeconds(10)
+ *     .execute();
+ * }
+ */ +public class WriteFragmentBuilder { + private String datasetUri; + private BufferAllocator allocator; + private VectorSchemaRoot vectorSchemaRoot; + private ArrowArrayStream arrowArrayStream; + private WriteParams writeParams; + private WriteParams.Builder writeParamsBuilder; + private StorageOptionsProvider storageOptionsProvider; + + WriteFragmentBuilder() {} + + /** + * Set the dataset URI where fragments will be written. + * + * @param datasetUri the dataset URI + * @return this builder + */ + public WriteFragmentBuilder datasetUri(String datasetUri) { + this.datasetUri = datasetUri; + return this; + } + + /** + * Set the buffer allocator for Arrow operations. + * + * @param allocator the buffer allocator + * @return this builder + */ + public WriteFragmentBuilder allocator(BufferAllocator allocator) { + this.allocator = allocator; + return this; + } + + /** + * Set the data to write using a VectorSchemaRoot. + * + * @param root the vector schema root containing the data + * @return this builder + */ + public WriteFragmentBuilder data(VectorSchemaRoot root) { + Preconditions.checkState( + this.arrowArrayStream == null, "Cannot set both VectorSchemaRoot and ArrowArrayStream"); + this.vectorSchemaRoot = root; + return this; + } + + /** + * Set the data to write using an ArrowArrayStream. + * + * @param stream the arrow array stream containing the data + * @return this builder + */ + public WriteFragmentBuilder data(ArrowArrayStream stream) { + Preconditions.checkState( + this.vectorSchemaRoot == null, "Cannot set both VectorSchemaRoot and ArrowArrayStream"); + this.arrowArrayStream = stream; + return this; + } + + /** + * Set the write parameters. + * + * @param params the write parameters + * @return this builder + */ + public WriteFragmentBuilder writeParams(WriteParams params) { + this.writeParams = params; + return this; + } + + /** + * Set storage options for object store access. 
+ * + * @param storageOptions the storage options + * @return this builder + */ + public WriteFragmentBuilder storageOptions(Map storageOptions) { + ensureWriteParamsBuilder(); + this.writeParamsBuilder.withStorageOptions(storageOptions); + return this; + } + + /** + * Set the storage options provider for dynamic credential refresh. + * + * @param provider the storage options provider + * @return this builder + */ + public WriteFragmentBuilder storageOptionsProvider(StorageOptionsProvider provider) { + this.storageOptionsProvider = provider; + return this; + } + + /** + * Set the S3 credentials refresh offset in seconds. + * + *

This parameter controls how long before credential expiration to refresh them. For example, + * if credentials expire at T+60s and this is set to 10, credentials will be refreshed at T+50s. + * + * @param seconds refresh offset in seconds + * @return this builder + */ + public WriteFragmentBuilder s3CredentialsRefreshOffsetSeconds(long seconds) { + ensureWriteParamsBuilder(); + this.writeParamsBuilder.withS3CredentialsRefreshOffsetSeconds(seconds); + return this; + } + + /** + * Set the maximum number of rows per file. + * + * @param maxRowsPerFile maximum rows per file + * @return this builder + */ + public WriteFragmentBuilder maxRowsPerFile(int maxRowsPerFile) { + ensureWriteParamsBuilder(); + this.writeParamsBuilder.withMaxRowsPerFile(maxRowsPerFile); + return this; + } + + /** + * Set the maximum number of rows per group. + * + * @param maxRowsPerGroup maximum rows per group + * @return this builder + */ + public WriteFragmentBuilder maxRowsPerGroup(int maxRowsPerGroup) { + ensureWriteParamsBuilder(); + this.writeParamsBuilder.withMaxRowsPerGroup(maxRowsPerGroup); + return this; + } + + /** + * Set the maximum number of bytes per file. + * + * @param maxBytesPerFile maximum bytes per file + * @return this builder + */ + public WriteFragmentBuilder maxBytesPerFile(long maxBytesPerFile) { + ensureWriteParamsBuilder(); + this.writeParamsBuilder.withMaxBytesPerFile(maxBytesPerFile); + return this; + } + + /** + * Set the write mode. + * + * @param mode the write mode + * @return this builder + */ + public WriteFragmentBuilder mode(WriteParams.WriteMode mode) { + ensureWriteParamsBuilder(); + this.writeParamsBuilder.withMode(mode); + return this; + } + + /** + * Enable or disable stable row IDs. 
+ * + * @param enable whether to enable stable row IDs + * @return this builder + */ + public WriteFragmentBuilder enableStableRowIds(boolean enable) { + ensureWriteParamsBuilder(); + this.writeParamsBuilder.withEnableStableRowIds(enable); + return this; + } + + /** + * Set the data storage version. + * + * @param version the data storage version + * @return this builder + */ + public WriteFragmentBuilder dataStorageVersion(WriteParams.LanceFileVersion version) { + ensureWriteParamsBuilder(); + this.writeParamsBuilder.withDataStorageVersion(version); + return this; + } + + /** + * Execute the fragment write operation. + * + * @return the list of fragment metadata for the created fragments + */ + public List execute() { + validate(); + + // Build the write params if builder was used + WriteParams finalWriteParams = buildWriteParams(); + + if (vectorSchemaRoot != null) { + return Fragment.create( + datasetUri, allocator, vectorSchemaRoot, finalWriteParams, storageOptionsProvider); + } else { + return Fragment.create( + datasetUri, arrowArrayStream, finalWriteParams, storageOptionsProvider); + } + } + + private void ensureWriteParamsBuilder() { + if (this.writeParamsBuilder == null) { + this.writeParamsBuilder = new WriteParams.Builder(); + } + } + + private WriteParams buildWriteParams() { + if (writeParams != null) { + return writeParams; + } else if (writeParamsBuilder != null) { + return writeParamsBuilder.build(); + } else { + return new WriteParams.Builder().build(); + } + } + + private void validate() { + Preconditions.checkNotNull(datasetUri, "datasetUri is required"); + Preconditions.checkState( + vectorSchemaRoot != null || arrowArrayStream != null, + "Either VectorSchemaRoot or ArrowArrayStream must be provided"); + Preconditions.checkState( + vectorSchemaRoot == null || allocator != null, + "allocator is required when using VectorSchemaRoot"); + Preconditions.checkState( + writeParams == null || writeParamsBuilder == null, + "Cannot use both writeParams() 
and individual parameter methods"); + } +} From 1952f0c0074965c892d930e8b7a5136123a9fff7 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 20 Nov 2025 23:19:48 -0800 Subject: [PATCH 07/12] more tests --- .../lance/NamespaceIntegrationTest.java | 292 ++++++++++++++++++ 1 file changed, 292 insertions(+) diff --git a/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java b/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java index 18f456e338d..8936f1d1c3d 100644 --- a/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java +++ b/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java @@ -15,6 +15,7 @@ import com.lancedb.lance.namespace.DirectoryNamespace; import com.lancedb.lance.namespace.LanceNamespace; +import com.lancedb.lance.namespace.LanceNamespaceStorageOptionsProvider; import com.lancedb.lance.namespace.model.CreateEmptyTableRequest; import com.lancedb.lance.namespace.model.CreateEmptyTableResponse; import com.lancedb.lance.namespace.model.DescribeTableRequest; @@ -44,10 +45,12 @@ import software.amazon.awssdk.services.s3.model.S3Object; import java.net.URI; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.UUID; import java.util.concurrent.atomic.AtomicInteger; @@ -739,4 +742,293 @@ public VectorSchemaRoot getVectorSchemaRoot() { } } } + + @Test + void testWriteDatasetBuilderWithNamespaceOverwrite() throws Exception { + try (BufferAllocator allocator = new RootAllocator()) { + // Set up storage options + Map storageOptions = new HashMap<>(); + storageOptions.put("allow_http", "true"); + storageOptions.put("aws_access_key_id", ACCESS_KEY); + storageOptions.put("aws_secret_access_key", SECRET_KEY); + storageOptions.put("aws_endpoint", ENDPOINT_URL); + storageOptions.put("aws_region", REGION); + + // Create tracking namespace + TrackingNamespace namespace = new TrackingNamespace(BUCKET_NAME, 
storageOptions, 60); + String tableName = UUID.randomUUID().toString(); + + Schema schema = + new Schema( + Arrays.asList( + new Field("a", FieldType.nullable(new ArrowType.Int(32, true)), null), + new Field("b", FieldType.nullable(new ArrowType.Int(32, true)), null))); + + // Create initial dataset with 1 row + try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + IntVector aVector = (IntVector) root.getVector("a"); + IntVector bVector = (IntVector) root.getVector("b"); + + aVector.allocateNew(1); + bVector.allocateNew(1); + + aVector.set(0, 1); + bVector.set(0, 2); + + aVector.setValueCount(1); + bVector.setValueCount(1); + root.setRowCount(1); + + ArrowReader createReader = + new ArrowReader(allocator) { + boolean firstRead = true; + + @Override + public boolean loadNextBatch() { + if (firstRead) { + firstRead = false; + return true; + } + return false; + } + + @Override + public long bytesRead() { + return 0; + } + + @Override + protected void closeReadSource() {} + + @Override + protected Schema readSchema() { + return schema; + } + + @Override + public VectorSchemaRoot getVectorSchemaRoot() { + return root; + } + }; + + try (Dataset dataset = + Dataset.write() + .allocator(allocator) + .reader(createReader) + .namespace(namespace) + .tableId(Arrays.asList(tableName)) + .mode(WriteParams.WriteMode.CREATE) + .execute()) { + assertEquals(1, dataset.countRows()); + } + + assertEquals(1, namespace.getCreateCallCount(), "createEmptyTable should be called once"); + assertEquals(0, namespace.getDescribeCallCount(), "describeTable should not be called yet"); + + // Now overwrite with 2 rows + aVector.allocateNew(2); + bVector.allocateNew(2); + + aVector.set(0, 10); + bVector.set(0, 20); + aVector.set(1, 100); + bVector.set(1, 200); + + aVector.setValueCount(2); + bVector.setValueCount(2); + root.setRowCount(2); + + ArrowReader overwriteReader = + new ArrowReader(allocator) { + boolean firstRead = true; + + @Override + public boolean 
loadNextBatch() { + if (firstRead) { + firstRead = false; + return true; + } + return false; + } + + @Override + public long bytesRead() { + return 0; + } + + @Override + protected void closeReadSource() {} + + @Override + protected Schema readSchema() { + return schema; + } + + @Override + public VectorSchemaRoot getVectorSchemaRoot() { + return root; + } + }; + + try (Dataset dataset = + Dataset.write() + .allocator(allocator) + .reader(overwriteReader) + .namespace(namespace) + .tableId(Arrays.asList(tableName)) + .mode(WriteParams.WriteMode.OVERWRITE) + .execute()) { + + // Verify describeTable was called for overwrite + assertEquals(1, namespace.getCreateCallCount(), "createEmptyTable should still be 1"); + int describeCountAfterOverwrite = namespace.getDescribeCallCount(); + assertEquals( + 1, describeCountAfterOverwrite, "describeTable should be called once for overwrite"); + + // Verify data was overwritten successfully + assertEquals(2, dataset.countRows()); + assertEquals( + 2, dataset.listVersions().size()); // Version 1 (create) + Version 2 (overwrite) + } + + // Verify we can open and read the dataset through namespace + try (Dataset ds = + Dataset.open() + .allocator(allocator) + .namespace(namespace) + .tableId(Arrays.asList(tableName)) + .build()) { + assertEquals(2, ds.countRows(), "Should have 2 rows after overwrite"); + assertEquals(2, ds.listVersions().size(), "Should have 2 versions"); + } + } + } + } + + @Test + void testDistributedWriteWithNamespace() throws Exception { + try (BufferAllocator allocator = new RootAllocator()) { + // Set up storage options + Map storageOptions = new HashMap<>(); + storageOptions.put("allow_http", "true"); + storageOptions.put("aws_access_key_id", ACCESS_KEY); + storageOptions.put("aws_secret_access_key", SECRET_KEY); + storageOptions.put("aws_endpoint", ENDPOINT_URL); + storageOptions.put("aws_region", REGION); + + // Create tracking namespace + TrackingNamespace namespace = new TrackingNamespace(BUCKET_NAME, 
storageOptions, 60); + String tableName = UUID.randomUUID().toString(); + + Schema schema = + new Schema( + Arrays.asList( + new Field("a", FieldType.nullable(new ArrowType.Int(32, true)), null), + new Field("b", FieldType.nullable(new ArrowType.Int(32, true)), null))); + + // Step 1: Create empty table via namespace + CreateEmptyTableRequest request = new CreateEmptyTableRequest(); + request.setId(Arrays.asList(tableName)); + CreateEmptyTableResponse response = namespace.createEmptyTable(request); + + assertEquals(1, namespace.getCreateCallCount(), "createEmptyTable should be called once"); + assertEquals(0, namespace.getDescribeCallCount(), "describeTable should not be called yet"); + + String tableUri = response.getLocation(); + Map namespaceStorageOptions = response.getStorageOptions(); + + // Merge storage options + Map mergedOptions = new HashMap<>(storageOptions); + if (namespaceStorageOptions != null) { + mergedOptions.putAll(namespaceStorageOptions); + } + + // Create storage options provider + LanceNamespaceStorageOptionsProvider storageOptionsProvider = + new LanceNamespaceStorageOptionsProvider(namespace, Arrays.asList(tableName)); + + WriteParams writeParams = new WriteParams.Builder().withStorageOptions(mergedOptions).build(); + + // Step 2: Write multiple fragments in parallel (simulated) + List allFragments = new ArrayList<>(); + + // Fragment 1: 2 rows + try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + IntVector aVector = (IntVector) root.getVector("a"); + IntVector bVector = (IntVector) root.getVector("b"); + + aVector.allocateNew(2); + bVector.allocateNew(2); + aVector.set(0, 1); + bVector.set(0, 2); + aVector.set(1, 3); + bVector.set(1, 4); + aVector.setValueCount(2); + bVector.setValueCount(2); + root.setRowCount(2); + + List fragment1 = + Fragment.create(tableUri, allocator, root, writeParams, storageOptionsProvider); + allFragments.addAll(fragment1); + } + + // Fragment 2: 2 rows + try (VectorSchemaRoot root = 
VectorSchemaRoot.create(schema, allocator)) { + IntVector aVector = (IntVector) root.getVector("a"); + IntVector bVector = (IntVector) root.getVector("b"); + + aVector.allocateNew(2); + bVector.allocateNew(2); + aVector.set(0, 10); + bVector.set(0, 20); + aVector.set(1, 30); + bVector.set(1, 40); + aVector.setValueCount(2); + bVector.setValueCount(2); + root.setRowCount(2); + + List fragment2 = + Fragment.create(tableUri, allocator, root, writeParams, storageOptionsProvider); + allFragments.addAll(fragment2); + } + + // Fragment 3: 1 row + try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + IntVector aVector = (IntVector) root.getVector("a"); + IntVector bVector = (IntVector) root.getVector("b"); + + aVector.allocateNew(1); + bVector.allocateNew(1); + aVector.set(0, 100); + bVector.set(0, 200); + aVector.setValueCount(1); + bVector.setValueCount(1); + root.setRowCount(1); + + List fragment3 = + Fragment.create(tableUri, allocator, root, writeParams, storageOptionsProvider); + allFragments.addAll(fragment3); + } + + // Step 3: Commit all fragments as one operation + FragmentOperation.Overwrite overwriteOp = + new FragmentOperation.Overwrite(allFragments, schema); + + try (Dataset dataset = + Dataset.commit(allocator, tableUri, overwriteOp, Optional.empty(), mergedOptions)) { + assertEquals(5, dataset.countRows(), "Should have 5 total rows from all fragments"); + assertEquals(1, dataset.listVersions().size(), "Should have 1 version after commit"); + } + + // Step 4: Open dataset through namespace and verify + try (Dataset dsFromNamespace = + Dataset.open() + .allocator(allocator) + .namespace(namespace) + .tableId(Arrays.asList(tableName)) + .build()) { + assertEquals(5, dsFromNamespace.countRows(), "Should read 5 rows through namespace"); + } + } + } } From 95efe90e9252b67f3cdf23f01df75ba080fa6108 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Fri, 21 Nov 2025 12:04:04 -0800 Subject: [PATCH 08/12] address comments --- 
.../src/main/java/com/lancedb/lance/OpenDatasetBuilder.java | 6 +++--- .../main/java/com/lancedb/lance/WriteDatasetBuilder.java | 6 +++--- .../main/java/com/lancedb/lance/WriteFragmentBuilder.java | 3 +++ .../com/lancedb/lance/StorageOptionsProviderWriteTest.java | 5 ++++- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/java/src/main/java/com/lancedb/lance/OpenDatasetBuilder.java b/java/src/main/java/com/lancedb/lance/OpenDatasetBuilder.java index 1114bdcf876..268f274a2f4 100644 --- a/java/src/main/java/com/lancedb/lance/OpenDatasetBuilder.java +++ b/java/src/main/java/com/lancedb/lance/OpenDatasetBuilder.java @@ -129,10 +129,10 @@ public OpenDatasetBuilder readOptions(ReadOptions options) { } /** - * Sets whether to ignore storage options from the namespace's describe_table(). + * Sets whether to ignore storage options from the namespace's describeTable(). * * @param ignoreNamespaceTableStorageOptions If true, storage options returned from - * describe_table() will be ignored (treated as null) + * describeTable() will be ignored (treated as null) * @return this builder instance */ public OpenDatasetBuilder ignoreNamespaceTableStorageOptions( @@ -145,7 +145,7 @@ public OpenDatasetBuilder ignoreNamespaceTableStorageOptions( * Opens the dataset with the configured parameters. * *

If a namespace is configured, this automatically fetches the table location and storage - * options from the namespace via describe_table(). + * options from the namespace via describeTable(). * * @return Dataset * @throws IllegalArgumentException if required parameters are missing or invalid diff --git a/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java b/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java index 6903bd00d9c..b8e7092b420 100644 --- a/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java +++ b/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java @@ -181,7 +181,7 @@ public WriteDatasetBuilder mode(WriteParams.WriteMode mode) { /** * Sets the schema for the dataset. * - *

If not provided, the schema will be inferred from the reader. + *

If the reader and stream are not provided, this is used to create an empty dataset. * * @param schema The dataset schema * @return this builder instance @@ -203,8 +203,8 @@ public WriteDatasetBuilder storageOptions(Map storageOptions) { } /** - * Sets whether to ignore storage options from the namespace's describe_table() or - * create_empty_table(). + * Sets whether to ignore storage options from the namespace's describeTable() or + * createEmptyTable(). * * @param ignoreNamespaceStorageOptions If true, storage options returned from namespace will be * ignored diff --git a/java/src/main/java/com/lancedb/lance/WriteFragmentBuilder.java b/java/src/main/java/com/lancedb/lance/WriteFragmentBuilder.java index d504d0a9373..10b4dd6735f 100644 --- a/java/src/main/java/com/lancedb/lance/WriteFragmentBuilder.java +++ b/java/src/main/java/com/lancedb/lance/WriteFragmentBuilder.java @@ -262,6 +262,9 @@ private void validate() { Preconditions.checkState( vectorSchemaRoot != null || arrowArrayStream != null, "Either VectorSchemaRoot or ArrowArrayStream must be provided"); + Preconditions.checkState( + vectorSchemaRoot == null || arrowArrayStream == null, + "Cannot set both VectorSchemaRoot and ArrowArrayStream"); Preconditions.checkState( vectorSchemaRoot == null || allocator != null, "allocator is required when using VectorSchemaRoot"); diff --git a/java/src/test/java/com/lancedb/lance/StorageOptionsProviderWriteTest.java b/java/src/test/java/com/lancedb/lance/StorageOptionsProviderWriteTest.java index 6d8408f3286..e56833e1c5a 100644 --- a/java/src/test/java/com/lancedb/lance/StorageOptionsProviderWriteTest.java +++ b/java/src/test/java/com/lancedb/lance/StorageOptionsProviderWriteTest.java @@ -319,7 +319,10 @@ void testTransactionCommitWithStorageOptionsProviderNewDataset() throws Exceptio nameVector.setValueCount(2); root.setRowCount(2); - WriteParams writeParams = new WriteParams.Builder().build(); + // Get initial storage options from provider (which includes expires_at_millis) 
+ Map initialOptions = provider.fetchStorageOptions(); + WriteParams writeParams = + new WriteParams.Builder().withStorageOptions(initialOptions).build(); // Create fragments with provider List fragments = From 30f81cade1c12d04454d1ab98687aa7da9bba208 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Fri, 21 Nov 2025 12:35:50 -0800 Subject: [PATCH 09/12] fix ci --- .github/workflows/java.yml | 22 +- .../lance/NamespaceIntegrationTest.java | 245 ++++++++ .../StorageOptionsProviderWriteTest.java | 575 ------------------ 3 files changed, 250 insertions(+), 592 deletions(-) delete mode 100644 java/src/test/java/com/lancedb/lance/StorageOptionsProviderWriteTest.java diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index 27254d70fe2..5bce3470b8e 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -49,22 +49,9 @@ jobs: matrix: java-version: [8, 11, 17] name: Build and Test with Java ${{ matrix.java-version }} - services: - localstack: - image: localstack/localstack:4.0 - ports: - - 4566:4566 - env: - SERVICES: s3,dynamodb,kms - AWS_ACCESS_KEY_ID: ACCESS_KEY - AWS_SECRET_ACCESS_KEY: SECRET_KEY - options: >- - --health-cmd "curl -s http://localhost:4566/_localstack/health" - --health-interval 5s - --health-timeout 3s - --health-retries 3 - --health-start-period 10s steps: + - name: Checkout repository + uses: actions/checkout@v4 - name: Install dependencies run: | sudo apt update @@ -76,8 +63,6 @@ jobs: - uses: rui314/setup-mold@v1 - name: Install cargo-llvm-cov uses: taiki-e/install-action@cargo-llvm-cov - - name: Checkout repository - uses: actions/checkout@v4 - uses: Swatinem/rust-cache@v2 with: workspaces: java/lance-jni -> ../target/rust-maven-plugin/lance-jni @@ -91,6 +76,9 @@ jobs: working-directory: java run: | mvn spotless:check + - name: Start localstack + run: | + docker compose -f docker-compose.yml up -d --wait - name: Running tests with Java ${{ matrix.java-version }} working-directory: java env: diff --git 
a/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java b/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java index 8936f1d1c3d..df8fa0efd1f 100644 --- a/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java +++ b/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java @@ -20,6 +20,7 @@ import com.lancedb.lance.namespace.model.CreateEmptyTableResponse; import com.lancedb.lance.namespace.model.DescribeTableRequest; import com.lancedb.lance.namespace.model.DescribeTableResponse; +import com.lancedb.lance.operation.Append; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; @@ -1031,4 +1032,248 @@ void testDistributedWriteWithNamespace() throws Exception { } } } + + @Test + void testFragmentCreateWithStorageOptionsProvider() throws Exception { + try (BufferAllocator allocator = new RootAllocator()) { + // Set up storage options + Map storageOptions = new HashMap<>(); + storageOptions.put("allow_http", "true"); + storageOptions.put("aws_access_key_id", ACCESS_KEY); + storageOptions.put("aws_secret_access_key", SECRET_KEY); + storageOptions.put("aws_endpoint", ENDPOINT_URL); + storageOptions.put("aws_region", REGION); + + // Create tracking namespace with 60-second expiration + TrackingNamespace namespace = new TrackingNamespace(BUCKET_NAME, storageOptions, 60); + String tableName = UUID.randomUUID().toString(); + + Schema schema = + new Schema( + Arrays.asList( + new Field("id", FieldType.nullable(new ArrowType.Int(32, true)), null), + new Field("value", FieldType.nullable(new ArrowType.Int(32, true)), null))); + + // Create empty table via namespace + CreateEmptyTableRequest request = new CreateEmptyTableRequest(); + request.setId(Arrays.asList(tableName)); + CreateEmptyTableResponse response = namespace.createEmptyTable(request); + + String tableUri = response.getLocation(); + Map namespaceStorageOptions = response.getStorageOptions(); + + // Merge storage options + Map 
mergedOptions = new HashMap<>(storageOptions); + if (namespaceStorageOptions != null) { + mergedOptions.putAll(namespaceStorageOptions); + } + + // Create storage options provider + LanceNamespaceStorageOptionsProvider provider = + new LanceNamespaceStorageOptionsProvider(namespace, Arrays.asList(tableName)); + + WriteParams writeParams = new WriteParams.Builder().withStorageOptions(mergedOptions).build(); + + try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + IntVector idVector = (IntVector) root.getVector("id"); + IntVector valueVector = (IntVector) root.getVector("value"); + + // Write first fragment + idVector.allocateNew(3); + valueVector.allocateNew(3); + + idVector.set(0, 1); + valueVector.set(0, 100); + idVector.set(1, 2); + valueVector.set(1, 200); + idVector.set(2, 3); + valueVector.set(2, 300); + + idVector.setValueCount(3); + valueVector.setValueCount(3); + root.setRowCount(3); + + int describeCountBefore = namespace.getDescribeCallCount(); + + // Create fragment with StorageOptionsProvider + List fragments1 = + Fragment.create(tableUri, allocator, root, writeParams, provider); + + assertEquals(1, fragments1.size()); + + // Verify provider was called (describeTable is called) + int describeCountAfter1 = namespace.getDescribeCallCount(); + assertEquals( + 1, + describeCountAfter1 - describeCountBefore, + "Provider should call describeTable during fragment creation"); + + // Write second fragment with different data + idVector.set(0, 4); + valueVector.set(0, 400); + idVector.set(1, 5); + valueVector.set(1, 500); + idVector.set(2, 6); + valueVector.set(2, 600); + root.setRowCount(3); + + // Create another fragment with the same provider + List fragments2 = + Fragment.create(tableUri, allocator, root, writeParams, provider); + + assertEquals(1, fragments2.size()); + + int describeCountAfter2 = namespace.getDescribeCallCount(); + assertEquals( + 2, + describeCountAfter2 - describeCountBefore, + "Provider should call describeTable 
again for second fragment"); + + // Commit both fragments to the dataset + FragmentOperation.Append appendOp = new FragmentOperation.Append(fragments1); + try (Dataset updatedDataset = + Dataset.commit(allocator, tableUri, appendOp, Optional.empty(), mergedOptions)) { + assertEquals(1, updatedDataset.version()); + assertEquals(3, updatedDataset.countRows()); + + // Append second fragment + FragmentOperation.Append appendOp2 = new FragmentOperation.Append(fragments2); + try (Dataset finalDataset = + Dataset.commit( + allocator, tableUri, appendOp2, Optional.of(1L), mergedOptions)) { + assertEquals(2, finalDataset.version()); + assertEquals(6, finalDataset.countRows()); + } + } + } + } + } + + @Test + void testTransactionCommitWithStorageOptionsProvider() throws Exception { + try (BufferAllocator allocator = new RootAllocator()) { + // Set up storage options + Map storageOptions = new HashMap<>(); + storageOptions.put("allow_http", "true"); + storageOptions.put("aws_access_key_id", ACCESS_KEY); + storageOptions.put("aws_secret_access_key", SECRET_KEY); + storageOptions.put("aws_endpoint", ENDPOINT_URL); + storageOptions.put("aws_region", REGION); + + // Create tracking namespace + TrackingNamespace namespace = new TrackingNamespace(BUCKET_NAME, storageOptions, 60); + String tableName = UUID.randomUUID().toString(); + + Schema schema = + new Schema( + Arrays.asList( + new Field("id", FieldType.nullable(new ArrowType.Int(32, true)), null), + new Field("name", FieldType.nullable(new ArrowType.Utf8()), null))); + + // Create empty table via namespace + CreateEmptyTableRequest request = new CreateEmptyTableRequest(); + request.setId(Arrays.asList(tableName)); + CreateEmptyTableResponse response = namespace.createEmptyTable(request); + + String tableUri = response.getLocation(); + Map namespaceStorageOptions = response.getStorageOptions(); + + // Merge storage options + Map mergedOptions = new HashMap<>(storageOptions); + if (namespaceStorageOptions != null) { + 
mergedOptions.putAll(namespaceStorageOptions); + } + + // Create storage options provider + LanceNamespaceStorageOptionsProvider provider = + new LanceNamespaceStorageOptionsProvider(namespace, Arrays.asList(tableName)); + + // First, write some initial data using Fragment.create and commit + WriteParams writeParams = new WriteParams.Builder().withStorageOptions(mergedOptions).build(); + + List initialFragments; + try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + IntVector idVector = (IntVector) root.getVector("id"); + org.apache.arrow.vector.VarCharVector nameVector = + (org.apache.arrow.vector.VarCharVector) root.getVector("name"); + + idVector.allocateNew(2); + nameVector.allocateNew(2); + + idVector.set(0, 1); + nameVector.setSafe(0, "Alice".getBytes()); + idVector.set(1, 2); + nameVector.setSafe(1, "Bob".getBytes()); + + idVector.setValueCount(2); + nameVector.setValueCount(2); + root.setRowCount(2); + + initialFragments = Fragment.create(tableUri, allocator, root, writeParams, provider); + } + + // Commit initial fragments + FragmentOperation.Overwrite overwriteOp = + new FragmentOperation.Overwrite(initialFragments, schema); + try (Dataset dataset = + Dataset.commit(allocator, tableUri, overwriteOp, Optional.empty(), mergedOptions)) { + assertEquals(1, dataset.version()); + assertEquals(2, dataset.countRows()); + } + + // Now test Transaction.commit with provider + // Open dataset with provider + ReadOptions readOptions = + new ReadOptions.Builder() + .setStorageOptions(storageOptions) + .setStorageOptionsProvider(provider) + .build(); + + try (Dataset datasetWithProvider = Dataset.open(allocator, tableUri, readOptions)) { + // Create more fragments to append + List newFragments; + try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + IntVector idVector = (IntVector) root.getVector("id"); + org.apache.arrow.vector.VarCharVector nameVector = + (org.apache.arrow.vector.VarCharVector) root.getVector("name"); + + 
idVector.allocateNew(2); + nameVector.allocateNew(2); + + idVector.set(0, 3); + nameVector.setSafe(0, "Charlie".getBytes()); + idVector.set(1, 4); + nameVector.setSafe(1, "Diana".getBytes()); + + idVector.setValueCount(2); + nameVector.setValueCount(2); + root.setRowCount(2); + + newFragments = Fragment.create(tableUri, allocator, root, writeParams, provider); + } + + int describeCountBefore = namespace.getDescribeCallCount(); + + // Create and commit transaction + Append appendOp = Append.builder().fragments(newFragments).build(); + Transaction transaction = + new Transaction.Builder(datasetWithProvider) + .readVersion(datasetWithProvider.version()) + .operation(appendOp) + .build(); + + try (Dataset committedDataset = transaction.commit()) { + assertEquals(2, committedDataset.version()); + assertEquals(4, committedDataset.countRows()); + + // Verify provider was called during commit + int describeCountAfter = namespace.getDescribeCallCount(); + assertEquals( + 1, + describeCountAfter - describeCountBefore, + "Provider should call describeTable during transaction commit"); + } + } + } + } } diff --git a/java/src/test/java/com/lancedb/lance/StorageOptionsProviderWriteTest.java b/java/src/test/java/com/lancedb/lance/StorageOptionsProviderWriteTest.java deleted file mode 100644 index e56833e1c5a..00000000000 --- a/java/src/test/java/com/lancedb/lance/StorageOptionsProviderWriteTest.java +++ /dev/null @@ -1,575 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package com.lancedb.lance; - -import com.lancedb.lance.io.StorageOptionsProvider; -import com.lancedb.lance.operation.Append; - -import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.IntVector; -import org.apache.arrow.vector.VectorSchemaRoot; -import org.apache.arrow.vector.types.pojo.ArrowType; -import org.apache.arrow.vector.types.pojo.Field; -import org.apache.arrow.vector.types.pojo.FieldType; -import org.apache.arrow.vector.types.pojo.Schema; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable; -import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; -import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; -import software.amazon.awssdk.regions.Region; -import software.amazon.awssdk.services.s3.S3Client; -import software.amazon.awssdk.services.s3.model.CreateBucketRequest; -import software.amazon.awssdk.services.s3.model.DeleteBucketRequest; -import software.amazon.awssdk.services.s3.model.DeleteObjectRequest; -import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; -import software.amazon.awssdk.services.s3.model.S3Object; - -import java.net.URI; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.UUID; -import java.util.concurrent.atomic.AtomicInteger; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertTrue; - -/** - * Integration tests for Fragment write and Transaction commit with StorageOptionsProvider. - * - *

These tests verify that: - Fragment.create() can use StorageOptionsProvider for credential - * refresh - Transaction.commit() can use StorageOptionsProvider when creating new datasets - - * Transaction.commit() can use StorageOptionsProvider when committing to existing datasets - * - *

These tests require LocalStack to be running. Run with: docker compose up -d - * - *

Set LANCE_INTEGRATION_TEST=1 environment variable to enable these tests. - */ -@EnabledIfEnvironmentVariable(named = "LANCE_INTEGRATION_TEST", matches = "1") -public class StorageOptionsProviderWriteTest { - - private static final String ENDPOINT_URL = "http://localhost:4566"; - private static final String REGION = "us-east-1"; - private static final String ACCESS_KEY = "ACCESS_KEY"; - private static final String SECRET_KEY = "SECRET_KEY"; - private static final String BUCKET_NAME = "lance-write-provider-test-java"; - - private static S3Client s3Client; - - @BeforeAll - static void setup() { - s3Client = - S3Client.builder() - .endpointOverride(URI.create(ENDPOINT_URL)) - .region(Region.of(REGION)) - .credentialsProvider( - StaticCredentialsProvider.create( - AwsBasicCredentials.create(ACCESS_KEY, SECRET_KEY))) - .forcePathStyle(true) - .build(); - - // Delete bucket if it exists from previous run - try { - deleteBucket(); - } catch (Exception e) { - // Ignore if bucket doesn't exist - } - - // Create test bucket - s3Client.createBucket(CreateBucketRequest.builder().bucket(BUCKET_NAME).build()); - } - - @AfterAll - static void tearDown() { - if (s3Client != null) { - try { - deleteBucket(); - } catch (Exception e) { - // Ignore cleanup errors - } - s3Client.close(); - } - } - - private static void deleteBucket() { - // Delete all objects first - List objects = - s3Client - .listObjectsV2(ListObjectsV2Request.builder().bucket(BUCKET_NAME).build()) - .contents(); - for (S3Object obj : objects) { - s3Client.deleteObject( - DeleteObjectRequest.builder().bucket(BUCKET_NAME).key(obj.key()).build()); - } - s3Client.deleteBucket(DeleteBucketRequest.builder().bucket(BUCKET_NAME).build()); - } - - /** - * Mock StorageOptionsProvider that tracks how many times it's been called and provides - * credentials with expiration times. 
- */ - static class MockStorageOptionsProvider implements StorageOptionsProvider { - private final Map baseStorageOptions; - private final int credentialExpiresInSeconds; - private final AtomicInteger callCount = new AtomicInteger(0); - - public MockStorageOptionsProvider( - Map storageOptions, int credentialExpiresInSeconds) { - this.baseStorageOptions = new HashMap<>(storageOptions); - this.credentialExpiresInSeconds = credentialExpiresInSeconds; - } - - @Override - public Map fetchStorageOptions() { - int count = callCount.incrementAndGet(); - - Map storageOptions = new HashMap<>(baseStorageOptions); - long expiresAtMillis = System.currentTimeMillis() + (credentialExpiresInSeconds * 1000L); - storageOptions.put("expires_at_millis", String.valueOf(expiresAtMillis)); - - // Add a marker to track which credential set this is - storageOptions.put("credential_version", String.valueOf(count)); - - return storageOptions; - } - - public int getCallCount() { - return callCount.get(); - } - } - - @Test - void testFragmentCreateWithStorageOptionsProvider() throws Exception { - try (BufferAllocator allocator = new RootAllocator()) { - String tableName = UUID.randomUUID().toString(); - String tableUri = "s3://" + BUCKET_NAME + "/" + tableName + ".lance"; - - // Create base storage options - Map storageOptions = new HashMap<>(); - storageOptions.put("allow_http", "true"); - storageOptions.put("aws_access_key_id", ACCESS_KEY); - storageOptions.put("aws_secret_access_key", SECRET_KEY); - storageOptions.put("aws_endpoint", ENDPOINT_URL); - storageOptions.put("aws_region", REGION); - - // Create mock provider with 60-second expiration - MockStorageOptionsProvider provider = new MockStorageOptionsProvider(storageOptions, 60); - - // Create schema - Schema schema = - new Schema( - Arrays.asList( - new Field("id", FieldType.nullable(new ArrowType.Int(32, true)), null), - new Field("value", FieldType.nullable(new ArrowType.Int(32, true)), null))); - - // First, create an empty 
dataset - WriteParams createParams = - new WriteParams.Builder().withStorageOptions(storageOptions).build(); - try (Dataset dataset = Dataset.create(allocator, tableUri, schema, createParams)) { - assertEquals(1, dataset.version()); - - // Now write fragments using StorageOptionsProvider - try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { - IntVector idVector = (IntVector) root.getVector("id"); - IntVector valueVector = (IntVector) root.getVector("value"); - - // Write first fragment - idVector.allocateNew(3); - valueVector.allocateNew(3); - - idVector.set(0, 1); - valueVector.set(0, 100); - idVector.set(1, 2); - valueVector.set(1, 200); - idVector.set(2, 3); - valueVector.set(2, 300); - - idVector.setValueCount(3); - valueVector.setValueCount(3); - root.setRowCount(3); - - WriteParams writeParams = new WriteParams.Builder().build(); - - int callCountBefore = provider.getCallCount(); - - // Create fragment with StorageOptionsProvider - List fragments1 = - Fragment.create(tableUri, allocator, root, writeParams, provider); - - assertNotNull(fragments1); - assertEquals(1, fragments1.size()); - - // Verify provider was called - int callCountAfter1 = provider.getCallCount(); - assertTrue( - callCountAfter1 > callCountBefore, - "Provider should be called during fragment creation"); - - // Write second fragment with different data - idVector.set(0, 4); - valueVector.set(0, 400); - idVector.set(1, 5); - valueVector.set(1, 500); - idVector.set(2, 6); - valueVector.set(2, 600); - root.setRowCount(3); - - // Create another fragment with the same provider - List fragments2 = - Fragment.create(tableUri, allocator, root, writeParams, provider); - - assertNotNull(fragments2); - assertEquals(1, fragments2.size()); - - int callCountAfter2 = provider.getCallCount(); - assertTrue( - callCountAfter2 > callCountAfter1, - "Provider should be called again for second fragment"); - - // Commit both fragments to the dataset - FragmentOperation.Append appendOp = new 
FragmentOperation.Append(fragments1); - try (Dataset updatedDataset = - Dataset.commit(allocator, tableUri, appendOp, Optional.of(1L), storageOptions)) { - assertEquals(2, updatedDataset.version()); - assertEquals(3, updatedDataset.countRows()); - - // Append second fragment - FragmentOperation.Append appendOp2 = new FragmentOperation.Append(fragments2); - try (Dataset finalDataset = - Dataset.commit(allocator, tableUri, appendOp2, Optional.of(2L), storageOptions)) { - assertEquals(3, finalDataset.version()); - assertEquals(6, finalDataset.countRows()); - } - } - } - } - } - } - - @Test - void testTransactionCommitWithStorageOptionsProviderNewDataset() throws Exception { - try (BufferAllocator allocator = new RootAllocator()) { - String tableName = UUID.randomUUID().toString(); - String tableUri = "s3://" + BUCKET_NAME + "/" + tableName + ".lance"; - - // Create base storage options - Map storageOptions = new HashMap<>(); - storageOptions.put("allow_http", "true"); - storageOptions.put("aws_access_key_id", ACCESS_KEY); - storageOptions.put("aws_secret_access_key", SECRET_KEY); - storageOptions.put("aws_endpoint", ENDPOINT_URL); - storageOptions.put("aws_region", REGION); - - // Create mock provider - MockStorageOptionsProvider provider = new MockStorageOptionsProvider(storageOptions, 60); - - // Create schema - Schema schema = - new Schema( - Arrays.asList( - new Field("id", FieldType.nullable(new ArrowType.Int(32, true)), null), - new Field("name", FieldType.nullable(new ArrowType.Utf8()), null))); - - // Create empty dataset first with provider - WriteParams createParams = - new WriteParams.Builder().withStorageOptions(storageOptions).build(); - try (Dataset dataset = Dataset.create(allocator, tableUri, schema, createParams)) { - assertEquals(1, dataset.version()); - - // Re-open dataset with provider so transactions can use it - ReadOptions readOptions = - new ReadOptions.Builder() - .setStorageOptions(storageOptions) - .setStorageOptionsProvider(provider) - 
.build(); - try (Dataset datasetWithProvider = Dataset.open(allocator, tableUri, readOptions)) { - - // Create fragments to commit - try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { - IntVector idVector = (IntVector) root.getVector("id"); - org.apache.arrow.vector.VarCharVector nameVector = - (org.apache.arrow.vector.VarCharVector) root.getVector("name"); - - idVector.allocateNew(2); - nameVector.allocateNew(2); - - idVector.set(0, 1); - nameVector.setSafe(0, "Alice".getBytes()); - idVector.set(1, 2); - nameVector.setSafe(1, "Bob".getBytes()); - - idVector.setValueCount(2); - nameVector.setValueCount(2); - root.setRowCount(2); - - // Get initial storage options from provider (which includes expires_at_millis) - Map initialOptions = provider.fetchStorageOptions(); - WriteParams writeParams = - new WriteParams.Builder().withStorageOptions(initialOptions).build(); - - // Create fragments with provider - List fragments = - Fragment.create(tableUri, allocator, root, writeParams, provider); - - // Create transaction - provider will be inherited from dataset - Append appendOp = Append.builder().fragments(fragments).build(); - - int callCountBefore = provider.getCallCount(); - - Transaction transaction = - new Transaction.Builder(datasetWithProvider) - .readVersion(1L) - .operation(appendOp) - .build(); - - // Commit transaction - try (Dataset committedDataset = transaction.commit()) { - assertNotNull(committedDataset); - assertEquals(2, committedDataset.version()); - assertEquals(2, committedDataset.countRows()); - - // Verify provider was called during commit - int callCountAfter = provider.getCallCount(); - assertTrue( - callCountAfter > callCountBefore, - "Provider should be called during transaction commit"); - } - } - } - } - } - } - - @Test - void testTransactionCommitWithStorageOptionsProviderExistingDataset() throws Exception { - try (BufferAllocator allocator = new RootAllocator()) { - String tableName = UUID.randomUUID().toString(); - 
String tableUri = "s3://" + BUCKET_NAME + "/" + tableName + ".lance"; - - // Create base storage options - Map storageOptions = new HashMap<>(); - storageOptions.put("allow_http", "true"); - storageOptions.put("aws_access_key_id", ACCESS_KEY); - storageOptions.put("aws_secret_access_key", SECRET_KEY); - storageOptions.put("aws_endpoint", ENDPOINT_URL); - storageOptions.put("aws_region", REGION); - - // Create schema - Schema schema = - new Schema( - Arrays.asList( - new Field("x", FieldType.nullable(new ArrowType.Int(32, true)), null), - new Field("y", FieldType.nullable(new ArrowType.Int(32, true)), null))); - - // Create initial dataset with some data - try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { - IntVector xVector = (IntVector) root.getVector("x"); - IntVector yVector = (IntVector) root.getVector("y"); - - xVector.allocateNew(2); - yVector.allocateNew(2); - - xVector.set(0, 1); - yVector.set(0, 10); - xVector.set(1, 2); - yVector.set(1, 20); - - xVector.setValueCount(2); - yVector.setValueCount(2); - root.setRowCount(2); - - WriteParams createParams = - new WriteParams.Builder().withStorageOptions(storageOptions).build(); - - try (Dataset dataset = Dataset.create(allocator, tableUri, schema, createParams)) { - List initialFragments = - Fragment.create(tableUri, allocator, root, createParams); - FragmentOperation.Append initialAppend = new FragmentOperation.Append(initialFragments); - - try (Dataset datasetV2 = - Dataset.commit(allocator, tableUri, initialAppend, Optional.of(1L), storageOptions)) { - assertEquals(2, datasetV2.version()); - assertEquals(2, datasetV2.countRows()); - - // Now test committing additional data with StorageOptionsProvider - MockStorageOptionsProvider provider = - new MockStorageOptionsProvider(storageOptions, 60); - - // Create more data to append - xVector.set(0, 3); - yVector.set(0, 30); - xVector.set(1, 4); - yVector.set(1, 40); - root.setRowCount(2); - - WriteParams writeParams = new 
WriteParams.Builder().build(); - - // Create fragments with provider - List newFragments = - Fragment.create(tableUri, allocator, root, writeParams, provider); - - int callCountBefore = provider.getCallCount(); - - // Open the existing dataset with provider - ReadOptions readOptions = - new ReadOptions.Builder() - .setStorageOptions(storageOptions) - .setStorageOptionsProvider(provider) - .build(); - try (Dataset existingDataset = Dataset.open(allocator, tableUri, readOptions)) { - // Create transaction - provider will be inherited from dataset - Append appendOp = Append.builder().fragments(newFragments).build(); - - Transaction transaction = - new Transaction.Builder(existingDataset) - .readVersion(existingDataset.version()) - .operation(appendOp) - .build(); - - // Commit transaction to existing dataset - try (Dataset committedDataset = transaction.commit()) { - assertNotNull(committedDataset); - assertEquals(3, committedDataset.version()); - assertEquals(4, committedDataset.countRows()); - - // Verify provider was called during commit - int callCountAfter = provider.getCallCount(); - assertTrue( - callCountAfter > callCountBefore, - "Provider should be called during transaction commit to existing dataset"); - } - } - } - } - } - } - } - - @Test - void testMultipleFragmentsAndCommitWithProvider() throws Exception { - try (BufferAllocator allocator = new RootAllocator()) { - String tableName = UUID.randomUUID().toString(); - String tableUri = "s3://" + BUCKET_NAME + "/" + tableName + ".lance"; - - // Create base storage options - Map storageOptions = new HashMap<>(); - storageOptions.put("allow_http", "true"); - storageOptions.put("aws_access_key_id", ACCESS_KEY); - storageOptions.put("aws_secret_access_key", SECRET_KEY); - storageOptions.put("aws_endpoint", ENDPOINT_URL); - storageOptions.put("aws_region", REGION); - - // Create mock provider with short expiration to test refresh - MockStorageOptionsProvider provider = new 
MockStorageOptionsProvider(storageOptions, 5); - - // Create schema - Schema schema = - new Schema( - Arrays.asList( - new Field("data", FieldType.nullable(new ArrowType.Int(32, true)), null))); - - // Create empty dataset - WriteParams createParams = - new WriteParams.Builder().withStorageOptions(storageOptions).build(); - try (Dataset dataset = Dataset.create(allocator, tableUri, schema, createParams)) { - assertEquals(1, dataset.version()); - - try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { - IntVector dataVector = (IntVector) root.getVector("data"); - WriteParams writeParams = new WriteParams.Builder().build(); - - // Create multiple fragments with the provider - List> allFragments = new java.util.ArrayList<>(); - - for (int i = 0; i < 3; i++) { - dataVector.allocateNew(2); - dataVector.set(0, i * 10); - dataVector.set(1, i * 10 + 1); - dataVector.setValueCount(2); - root.setRowCount(2); - - List fragments = - Fragment.create(tableUri, allocator, root, writeParams, provider); - allFragments.add(fragments); - - // Add a small delay to simulate real-world usage - Thread.sleep(100); - } - - // Verify provider was called multiple times - int callCountAfterFragments = provider.getCallCount(); - assertTrue(callCountAfterFragments >= 3, "Provider should be called for each fragment"); - - // Commit all fragments in one transaction with provider - int callCountBeforeCommit = provider.getCallCount(); - - FragmentOperation.Append appendOp1 = new FragmentOperation.Append(allFragments.get(0)); - try (Dataset v2 = - Dataset.commit(allocator, tableUri, appendOp1, Optional.of(1L), storageOptions)) { - - // Open v2 with provider so transaction can inherit it - ReadOptions readOptions = - new ReadOptions.Builder() - .setStorageOptions(storageOptions) - .setStorageOptionsProvider(provider) - .build(); - try (Dataset v2WithProvider = Dataset.open(allocator, tableUri, readOptions)) { - - Append appendOp2 = 
Append.builder().fragments(allFragments.get(1)).build(); - Transaction transaction = - new Transaction.Builder(v2WithProvider) - .readVersion(v2WithProvider.version()) - .operation(appendOp2) - .build(); - - try (Dataset v3 = transaction.commit()) { - assertEquals(3, v3.version()); - - // Re-open v3 with provider for next transaction - try (Dataset v3WithProvider = Dataset.open(allocator, tableUri, readOptions)) { - - // Commit third fragment - Append appendOp3 = Append.builder().fragments(allFragments.get(2)).build(); - Transaction transaction2 = - new Transaction.Builder(v3WithProvider) - .readVersion(v3WithProvider.version()) - .operation(appendOp3) - .build(); - - try (Dataset finalDataset = transaction2.commit()) { - assertNotNull(finalDataset); - assertEquals(4, finalDataset.version()); - assertEquals(6, finalDataset.countRows()); // 3 fragments * 2 rows each - - // Verify provider was used during commits - int finalCallCount = provider.getCallCount(); - assertTrue( - finalCallCount > callCountBeforeCommit, - "Provider should be called during transaction commits"); - } - } - } - } - } - } - } - } - } -} From 540a6d4980ac4d6a4ee1b5528a040728ed490f2e Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Fri, 21 Nov 2025 13:29:38 -0800 Subject: [PATCH 10/12] spotless --- .../lance/NamespaceIntegrationTest.java | 66 +++++++++---------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java b/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java index df8fa0efd1f..457f6f7799f 100644 --- a/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java +++ b/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java @@ -1034,7 +1034,7 @@ void testDistributedWriteWithNamespace() throws Exception { } @Test - void testFragmentCreateWithStorageOptionsProvider() throws Exception { + void testFragmentCreateAndCommitWithNamespace() throws Exception { try (BufferAllocator allocator = 
new RootAllocator()) { // Set up storage options Map storageOptions = new HashMap<>(); @@ -1059,6 +1059,8 @@ void testFragmentCreateWithStorageOptionsProvider() throws Exception { request.setId(Arrays.asList(tableName)); CreateEmptyTableResponse response = namespace.createEmptyTable(request); + assertEquals(1, namespace.getCreateCallCount(), "createEmptyTable should be called once"); + String tableUri = response.getLocation(); Map namespaceStorageOptions = response.getStorageOptions(); @@ -1093,21 +1095,12 @@ void testFragmentCreateWithStorageOptionsProvider() throws Exception { valueVector.setValueCount(3); root.setRowCount(3); - int describeCountBefore = namespace.getDescribeCallCount(); - // Create fragment with StorageOptionsProvider List fragments1 = Fragment.create(tableUri, allocator, root, writeParams, provider); assertEquals(1, fragments1.size()); - // Verify provider was called (describeTable is called) - int describeCountAfter1 = namespace.getDescribeCallCount(); - assertEquals( - 1, - describeCountAfter1 - describeCountBefore, - "Provider should call describeTable during fragment creation"); - // Write second fragment with different data idVector.set(0, 4); valueVector.set(0, 400); @@ -1123,34 +1116,39 @@ void testFragmentCreateWithStorageOptionsProvider() throws Exception { assertEquals(1, fragments2.size()); - int describeCountAfter2 = namespace.getDescribeCallCount(); - assertEquals( - 2, - describeCountAfter2 - describeCountBefore, - "Provider should call describeTable again for second fragment"); - - // Commit both fragments to the dataset - FragmentOperation.Append appendOp = new FragmentOperation.Append(fragments1); + // Commit first fragment to the dataset using Overwrite (for empty table) + FragmentOperation.Overwrite overwriteOp = + new FragmentOperation.Overwrite(fragments1, schema); try (Dataset updatedDataset = - Dataset.commit(allocator, tableUri, appendOp, Optional.empty(), mergedOptions)) { + Dataset.commit(allocator, tableUri, 
overwriteOp, Optional.empty(), mergedOptions)) { assertEquals(1, updatedDataset.version()); assertEquals(3, updatedDataset.countRows()); // Append second fragment FragmentOperation.Append appendOp2 = new FragmentOperation.Append(fragments2); try (Dataset finalDataset = - Dataset.commit( - allocator, tableUri, appendOp2, Optional.of(1L), mergedOptions)) { + Dataset.commit(allocator, tableUri, appendOp2, Optional.of(1L), mergedOptions)) { assertEquals(2, finalDataset.version()); assertEquals(6, finalDataset.countRows()); } } } + + // Verify we can open and read the dataset through namespace + try (Dataset ds = + Dataset.open() + .allocator(allocator) + .namespace(namespace) + .tableId(Arrays.asList(tableName)) + .build()) { + assertEquals(6, ds.countRows(), "Should have 6 rows total"); + assertEquals(2, ds.listVersions().size(), "Should have 2 versions"); + } } } @Test - void testTransactionCommitWithStorageOptionsProvider() throws Exception { + void testTransactionCommitWithNamespace() throws Exception { try (BufferAllocator allocator = new RootAllocator()) { // Set up storage options Map storageOptions = new HashMap<>(); @@ -1222,10 +1220,10 @@ void testTransactionCommitWithStorageOptionsProvider() throws Exception { } // Now test Transaction.commit with provider - // Open dataset with provider + // Open dataset with provider using mergedOptions (which has expires_at_millis) ReadOptions readOptions = new ReadOptions.Builder() - .setStorageOptions(storageOptions) + .setStorageOptions(mergedOptions) .setStorageOptionsProvider(provider) .build(); @@ -1252,8 +1250,6 @@ void testTransactionCommitWithStorageOptionsProvider() throws Exception { newFragments = Fragment.create(tableUri, allocator, root, writeParams, provider); } - int describeCountBefore = namespace.getDescribeCallCount(); - // Create and commit transaction Append appendOp = Append.builder().fragments(newFragments).build(); Transaction transaction = @@ -1265,15 +1261,19 @@ void 
testTransactionCommitWithStorageOptionsProvider() throws Exception { try (Dataset committedDataset = transaction.commit()) { assertEquals(2, committedDataset.version()); assertEquals(4, committedDataset.countRows()); - - // Verify provider was called during commit - int describeCountAfter = namespace.getDescribeCallCount(); - assertEquals( - 1, - describeCountAfter - describeCountBefore, - "Provider should call describeTable during transaction commit"); } } + + // Verify we can open and read the dataset through namespace + try (Dataset ds = + Dataset.open() + .allocator(allocator) + .namespace(namespace) + .tableId(Arrays.asList(tableName)) + .build()) { + assertEquals(4, ds.countRows(), "Should have 4 rows total"); + assertEquals(2, ds.listVersions().size(), "Should have 2 versions"); + } } } } From de8c9156f8d706c43532c3d0ab034ac32eecd17d Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Fri, 21 Nov 2025 14:21:44 -0800 Subject: [PATCH 11/12] commit --- java/lance-jni/src/blocking_dataset.rs | 58 +++++-- .../main/java/com/lancedb/lance/Dataset.java | 38 ++++- .../lancedb/lance/WriteDatasetBuilder.java | 19 ++- .../lance/NamespaceIntegrationTest.java | 161 ++++++++++++++++++ 4 files changed, 260 insertions(+), 16 deletions(-) diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index 50b64596e83..c1896820f1e 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -385,6 +385,7 @@ fn inner_create_with_ffi_schema<'local>( enable_stable_row_ids, data_storage_version, storage_options_obj, + JObject::null(), // No provider for schema-only creation s3_credentials_refresh_offset_seconds_obj, reader, ) @@ -432,6 +433,42 @@ pub extern "system" fn Java_com_lancedb_lance_Dataset_createWithFfiStream<'local enable_stable_row_ids, data_storage_version, storage_options_obj, + JObject::null(), + s3_credentials_refresh_offset_seconds_obj + ) + ) +} + +#[no_mangle] +pub extern "system" fn 
Java_com_lancedb_lance_Dataset_createWithFfiStreamAndProvider<'local>( + mut env: JNIEnv<'local>, + _obj: JObject, + arrow_array_stream_addr: jlong, + path: JString, + max_rows_per_file: JObject, // Optional + max_rows_per_group: JObject, // Optional + max_bytes_per_file: JObject, // Optional + mode: JObject, // Optional + enable_stable_row_ids: JObject, // Optional + data_storage_version: JObject, // Optional + storage_options_obj: JObject, // Map + storage_options_provider_obj: JObject, // Optional + s3_credentials_refresh_offset_seconds_obj: JObject, // Optional +) -> JObject<'local> { + ok_or_throw!( + env, + inner_create_with_ffi_stream( + &mut env, + arrow_array_stream_addr, + path, + max_rows_per_file, + max_rows_per_group, + max_bytes_per_file, + mode, + enable_stable_row_ids, + data_storage_version, + storage_options_obj, + storage_options_provider_obj, s3_credentials_refresh_offset_seconds_obj ) ) @@ -442,13 +479,14 @@ fn inner_create_with_ffi_stream<'local>( env: &mut JNIEnv<'local>, arrow_array_stream_addr: jlong, path: JString, - max_rows_per_file: JObject, // Optional - max_rows_per_group: JObject, // Optional - max_bytes_per_file: JObject, // Optional - mode: JObject, // Optional - enable_stable_row_ids: JObject, // Optional - data_storage_version: JObject, // Optional - storage_options_obj: JObject, // Map + max_rows_per_file: JObject, // Optional + max_rows_per_group: JObject, // Optional + max_bytes_per_file: JObject, // Optional + mode: JObject, // Optional + enable_stable_row_ids: JObject, // Optional + data_storage_version: JObject, // Optional + storage_options_obj: JObject, // Map + storage_options_provider_obj: JObject, // Optional s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let stream_ptr = arrow_array_stream_addr as *mut FFI_ArrowArrayStream; @@ -463,6 +501,7 @@ fn inner_create_with_ffi_stream<'local>( enable_stable_row_ids, data_storage_version, storage_options_obj, + storage_options_provider_obj, 
s3_credentials_refresh_offset_seconds_obj, reader, ) @@ -479,13 +518,12 @@ fn create_dataset<'local>( enable_stable_row_ids: JObject, data_storage_version: JObject, storage_options_obj: JObject, + storage_options_provider_obj: JObject, // Optional s3_credentials_refresh_offset_seconds_obj: JObject, reader: impl RecordBatchReader + Send + 'static, ) -> Result> { let path_str = path.extract(env)?; - // Dataset.create() doesn't support storage_options_provider yet, pass empty Optional - let empty_provider = JObject::null(); let write_params = extract_write_params( env, &max_rows_per_file, @@ -495,7 +533,7 @@ fn create_dataset<'local>( &enable_stable_row_ids, &data_storage_version, &storage_options_obj, - &empty_provider, + &storage_options_provider_obj, &s3_credentials_refresh_offset_seconds_obj, )?; diff --git a/java/src/main/java/com/lancedb/lance/Dataset.java b/java/src/main/java/com/lancedb/lance/Dataset.java index b6df659b9f4..9a3aa5f1fd9 100644 --- a/java/src/main/java/com/lancedb/lance/Dataset.java +++ b/java/src/main/java/com/lancedb/lance/Dataset.java @@ -161,12 +161,34 @@ public static Dataset create( @Deprecated public static Dataset create( BufferAllocator allocator, ArrowArrayStream stream, String path, WriteParams params) { + return create(allocator, stream, path, params, null); + } + + /** + * Create a dataset with given stream and storage options provider. + * + *

This method supports credential vending through the StorageOptionsProvider interface, which + * allows for dynamic credential refresh during long-running write operations. + * + * @param allocator buffer allocator + * @param stream arrow stream + * @param path dataset uri + * @param params write parameters + * @param storageOptionsProvider optional provider for dynamic storage options/credentials + * @return Dataset + */ + static Dataset create( + BufferAllocator allocator, + ArrowArrayStream stream, + String path, + WriteParams params, + StorageOptionsProvider storageOptionsProvider) { Preconditions.checkNotNull(allocator); Preconditions.checkNotNull(stream); Preconditions.checkNotNull(path); Preconditions.checkNotNull(params); Dataset dataset = - createWithFfiStream( + createWithFfiStreamAndProvider( stream.memoryAddress(), path, params.getMaxRowsPerFile(), @@ -176,6 +198,7 @@ public static Dataset create( params.getEnableStableRowIds(), params.getDataStorageVersion(), params.getStorageOptions(), + Optional.ofNullable(storageOptionsProvider), params.getS3CredentialsRefreshOffsetSeconds()); dataset.allocator = allocator; return dataset; @@ -205,6 +228,19 @@ private static native Dataset createWithFfiStream( Map storageOptions, Optional s3CredentialsRefreshOffsetSeconds); + private static native Dataset createWithFfiStreamAndProvider( + long arrowStreamMemoryAddress, + String path, + Optional maxRowsPerFile, + Optional maxRowsPerGroup, + Optional maxBytesPerFile, + Optional mode, + Optional enableStableRowIds, + Optional dataStorageVersion, + Map storageOptions, + Optional storageOptionsProvider, + Optional s3CredentialsRefreshOffsetSeconds); + /** * Open a dataset from the specified path. 
* diff --git a/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java b/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java index b8e7092b420..e9e7729d037 100644 --- a/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java +++ b/java/src/main/java/com/lancedb/lance/WriteDatasetBuilder.java @@ -13,7 +13,9 @@ */ package com.lancedb.lance; +import com.lancedb.lance.io.StorageOptionsProvider; import com.lancedb.lance.namespace.LanceNamespace; +import com.lancedb.lance.namespace.LanceNamespaceStorageOptionsProvider; import com.lancedb.lance.namespace.model.CreateEmptyTableRequest; import com.lancedb.lance.namespace.model.CreateEmptyTableResponse; import com.lancedb.lance.namespace.model.DescribeTableRequest; @@ -398,8 +400,14 @@ private Dataset executeWithNamespace() { WriteParams params = paramsBuilder.build(); + // Create storage options provider for credential refresh during long-running writes + StorageOptionsProvider storageOptionsProvider = + ignoreNamespaceStorageOptions + ? 
null + : new LanceNamespaceStorageOptionsProvider(namespace, tableId); + // Use Dataset.create() which handles CREATE/APPEND/OVERWRITE modes - return createDatasetWithStream(tableUri, params); + return createDatasetWithStream(tableUri, params, storageOptionsProvider); } private Dataset executeWithUri() { @@ -416,20 +424,21 @@ private Dataset executeWithUri() { WriteParams params = paramsBuilder.build(); - return createDatasetWithStream(uri, params); + return createDatasetWithStream(uri, params, null); } - private Dataset createDatasetWithStream(String path, WriteParams params) { + private Dataset createDatasetWithStream( + String path, WriteParams params, StorageOptionsProvider storageOptionsProvider) { // If stream is directly provided, use it if (stream != null) { - return Dataset.create(allocator, stream, path, params); + return Dataset.create(allocator, stream, path, params, storageOptionsProvider); } // If reader is provided, convert to stream if (reader != null) { try (ArrowArrayStream tempStream = ArrowArrayStream.allocateNew(allocator)) { Data.exportArrayStream(allocator, reader, tempStream); - return Dataset.create(allocator, tempStream, path, params); + return Dataset.create(allocator, tempStream, path, params, storageOptionsProvider); } } diff --git a/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java b/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java index 457f6f7799f..eabdc68ca6c 100644 --- a/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java +++ b/java/src/test/java/com/lancedb/lance/NamespaceIntegrationTest.java @@ -602,6 +602,167 @@ public VectorSchemaRoot getVectorSchemaRoot() { } } + @Test + void testWriteDatasetBuilderWithNamespaceCreateCallCounts() throws Exception { + try (BufferAllocator allocator = new RootAllocator()) { + // Set up storage options + Map storageOptions = new HashMap<>(); + storageOptions.put("allow_http", "true"); + storageOptions.put("aws_access_key_id", ACCESS_KEY); + 
storageOptions.put("aws_secret_access_key", SECRET_KEY); + storageOptions.put("aws_endpoint", ENDPOINT_URL); + storageOptions.put("aws_region", REGION); + + // Create tracking namespace with 60-second expiration (long enough that no refresh happens) + // Credentials expire at T+60s. With a 1s refresh offset, refresh would happen at T+59s. + // Since writes complete well under 59 seconds, NO credential refresh should occur. + TrackingNamespace namespace = new TrackingNamespace(BUCKET_NAME, storageOptions, 60); + String tableName = UUID.randomUUID().toString(); + + // Verify initial call counts + assertEquals(0, namespace.getCreateCallCount(), "createEmptyTable should not be called yet"); + assertEquals(0, namespace.getDescribeCallCount(), "describeTable should not be called yet"); + + // Create schema and data + Schema schema = + new Schema( + Arrays.asList( + new Field("a", FieldType.nullable(new ArrowType.Int(32, true)), null), + new Field("b", FieldType.nullable(new ArrowType.Int(32, true)), null))); + + try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) { + IntVector aVector = (IntVector) root.getVector("a"); + IntVector bVector = (IntVector) root.getVector("b"); + + aVector.allocateNew(2); + bVector.allocateNew(2); + + aVector.set(0, 1); + bVector.set(0, 2); + aVector.set(1, 10); + bVector.set(1, 20); + + aVector.setValueCount(2); + bVector.setValueCount(2); + root.setRowCount(2); + + // Create a test reader that returns our VectorSchemaRoot + ArrowReader testReader = + new ArrowReader(allocator) { + boolean firstRead = true; + + @Override + public boolean loadNextBatch() { + if (firstRead) { + firstRead = false; + return true; + } + return false; + } + + @Override + public long bytesRead() { + return 0; + } + + @Override + protected void closeReadSource() {} + + @Override + protected Schema readSchema() { + return schema; + } + + @Override + public VectorSchemaRoot getVectorSchemaRoot() { + return root; + } + }; + + // Use the write 
builder to create a dataset through namespace + // Set a 1-second refresh offset. Credentials expire at T+60s, so refresh at T+59s. + // Write completes instantly, so NO describeTable call should happen for refresh. + try (Dataset dataset = + Dataset.write() + .allocator(allocator) + .reader(testReader) + .namespace(namespace) + .tableId(Arrays.asList(tableName)) + .mode(WriteParams.WriteMode.CREATE) + .s3CredentialsRefreshOffsetSeconds(1) + .execute()) { + + // Verify createEmptyTable was called exactly ONCE + assertEquals( + 1, namespace.getCreateCallCount(), "createEmptyTable should be called exactly once"); + + // Verify describeTable was NOT called during CREATE + // Initial credentials come from createEmptyTable response, and since credentials + // don't expire during the fast write, NO refresh (describeTable) is needed + assertEquals( + 0, + namespace.getDescribeCallCount(), + "describeTable should NOT be called during CREATE - " + + "initial credentials come from createEmptyTable response and don't expire"); + + // Verify dataset was created successfully + assertEquals(2, dataset.countRows()); + assertEquals(schema, dataset.getSchema()); + } + } + + // Verify counts after dataset is closed + assertEquals( + 1, namespace.getCreateCallCount(), "createEmptyTable should still be 1 after close"); + assertEquals( + 0, + namespace.getDescribeCallCount(), + "describeTable should still be 0 after close (no refresh needed)"); + + // Now open the dataset through namespace with long-lived credentials (60s expiration) + // With 1s refresh offset, credentials are valid for 59s - plenty of time for reads + ReadOptions readOptions = + new ReadOptions.Builder().setS3CredentialsRefreshOffsetSeconds(1).build(); + + try (Dataset dsFromNamespace = + Dataset.open() + .allocator(allocator) + .namespace(namespace) + .tableId(Arrays.asList(tableName)) + .readOptions(readOptions) + .build()) { + + // createEmptyTable should NOT be called during open (only during CREATE) + 
assertEquals( + 1, + namespace.getCreateCallCount(), + "createEmptyTable should still be 1 (not called during open)"); + + // describeTable is called exactly ONCE during open to get table location + assertEquals( + 1, + namespace.getDescribeCallCount(), + "describeTable should be called exactly once during open"); + + // Verify we can read the data multiple times + assertEquals(2, dsFromNamespace.countRows()); + assertEquals(2, dsFromNamespace.countRows()); + assertEquals(2, dsFromNamespace.countRows()); + + // After multiple reads, no additional describeTable calls should be made + // (credentials are cached and don't expire during this fast test) + assertEquals( + 1, + namespace.getDescribeCallCount(), + "describeTable should still be 1 after reads (credentials cached, no refresh needed)"); + } + + // Final verification + assertEquals(1, namespace.getCreateCallCount(), "Final: createEmptyTable = 1"); + assertEquals(1, namespace.getDescribeCallCount(), "Final: describeTable = 1"); + } + } + @Test void testWriteDatasetBuilderWithNamespaceAppend() throws Exception { try (BufferAllocator allocator = new RootAllocator()) { From a58dbd5e131c6093e25bad83a13ff6fdad95baa3 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Mon, 24 Nov 2025 23:28:27 -0800 Subject: [PATCH 12/12] use get_optional --- java/lance-jni/src/utils.rs | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/java/lance-jni/src/utils.rs b/java/lance-jni/src/utils.rs index 50fc848313f..5a1a8b95089 100644 --- a/java/lance-jni/src/utils.rs +++ b/java/lance-jni/src/utils.rs @@ -76,28 +76,12 @@ pub fn extract_write_params( extract_storage_options(env, storage_options_obj)?; // Extract storage options provider if present - let storage_options_provider = if !storage_options_provider_obj.is_null() { - // Check if it's an Optional.empty() - let is_present = env - .call_method(storage_options_provider_obj, "isPresent", "()Z", &[])? 
- .z()?; - if is_present { - // Get the value from Optional - let provider_obj = env - .call_method( - storage_options_provider_obj, - "get", - "()Ljava/lang/Object;", - &[], - )? - .l()?; - Some(JavaStorageOptionsProvider::new(env, provider_obj)?) - } else { - None - } - } else { - None - }; + let storage_options_provider = env.get_optional(storage_options_provider_obj, |env, obj| { + let provider_obj = env + .call_method(obj, "get", "()Ljava/lang/Object;", &[])? + .l()?; + JavaStorageOptionsProvider::new(env, provider_obj) + })?; let storage_options_provider_arc: Option> = storage_options_provider.map(|v| Arc::new(v) as Arc);