diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index dde2b81879b..10647e0540c 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -76,16 +76,32 @@ pub struct BlockingDataset { } impl BlockingDataset { - /// Get the storage options provider that was used when opening this dataset - pub fn get_storage_options_provider(&self) -> Option> { - self.inner.storage_options_provider() + /// Get the initial storage options used to open this dataset. + /// + /// Returns the options that were provided when the dataset was opened, + /// without any refresh from the provider. Returns None if no storage options + /// were provided. + pub fn initial_storage_options(&self) -> Option> { + self.inner.initial_storage_options().cloned() + } + + /// Get the latest storage options, potentially refreshed from the provider. + /// + /// If a storage options provider was configured and credentials are expiring, + /// this will refresh them. + pub fn latest_storage_options(&self) -> Result>> { + RT.block_on(async { self.inner.latest_storage_options().await }) + .map(|opt| opt.map(|opts| opts.0)) + .map_err(|e| Error::io_error(e.to_string())) } pub fn drop(uri: &str, storage_options: HashMap) -> Result<()> { RT.block_on(async move { let registry = Arc::new(ObjectStoreRegistry::default()); let object_store_params = ObjectStoreParams { - storage_options: Some(storage_options.clone()), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), ..Default::default() }; let (object_store, path) = @@ -117,20 +133,29 @@ impl BlockingDataset { storage_options: HashMap, serialized_manifest: Option<&[u8]>, storage_options_provider: Option>, - s3_credentials_refresh_offset_seconds: Option, ) -> Result { - let mut store_params = ObjectStoreParams { + // Create storage options accessor from storage_options and provider + let accessor = match (storage_options.is_empty(), storage_options_provider) { + (false, Some(provider)) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_initial_and_provider( + storage_options, + provider, + ), + )), + (false, None) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), + (true, Some(provider)) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_provider(provider), + )), + (true, None) => None, + }; + + let store_params = ObjectStoreParams { block_size: block_size.map(|size| size as usize), - storage_options: Some(storage_options.clone()), + storage_options_accessor: accessor, ..Default::default() }; - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - store_params.s3_credentials_refresh_offset = - std::time::Duration::from_secs(offset_seconds); - } - if let Some(provider) = storage_options_provider.clone() { - store_params.storage_options_provider = Some(provider); - } let params = ReadParams { index_cache_size_bytes: index_cache_size_bytes as usize, metadata_cache_size_bytes: metadata_cache_size_bytes as usize, @@ -143,14 +168,6 @@ impl BlockingDataset { if let Some(ver) = version { builder = builder.with_version(ver as u64); } - builder = builder.with_storage_options(storage_options); - if let Some(provider) = storage_options_provider.clone() { - builder = builder.with_storage_options_provider(provider) - } - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - builder = builder - .with_s3_credentials_refresh_offset(std::time::Duration::from_secs(offset_seconds)); - } if let Some(serialized_manifest) = serialized_manifest { builder = builder.with_serialized_manifest(serialized_manifest)?; @@ -166,12 +183,19 @@ impl BlockingDataset { read_version: Option, storage_options: HashMap, ) -> Result { + let accessor = if storage_options.is_empty() { + None + } else { + Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )) + }; let inner = RT.block_on(Dataset::commit( uri, operation, read_version, Some(ObjectStoreParams { - storage_options: Some(storage_options), + storage_options_accessor: accessor, ..Default::default() }), None, @@ -335,7 +359,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiSchema<'local>( data_storage_version: JObject, // Optional enable_v2_manifest_paths: JObject, // Optional storage_options_obj: JObject, // Map - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional initial_bases: JObject, target_bases: JObject, ) -> JObject<'local> { @@ -353,7 +376,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiSchema<'local>( data_storage_version, enable_v2_manifest_paths, storage_options_obj, - s3_credentials_refresh_offset_seconds_obj, initial_bases, target_bases, ) @@ -373,7 +395,6 @@ fn inner_create_with_ffi_schema<'local>( data_storage_version: JObject, // Optional enable_v2_manifest_paths: JObject, // Optional storage_options_obj: JObject, // Map - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional initial_bases: JObject, target_bases: JObject, ) -> Result> { @@ -394,7 +415,6 @@ fn inner_create_with_ffi_schema<'local>( enable_v2_manifest_paths, storage_options_obj, JObject::null(), // No provider for schema-only creation - s3_credentials_refresh_offset_seconds_obj, initial_bases, target_bases, reader, @@ -447,7 +467,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiStream<'local>( data_storage_version: JObject, // Optional enable_v2_manifest_paths: JObject, // Optional storage_options_obj: JObject, // Map - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional initial_bases: JObject, target_bases: JObject, ) -> JObject<'local> { @@ -466,7 +485,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiStream<'local>( data_storage_version, storage_options_obj, JObject::null(), - s3_credentials_refresh_offset_seconds_obj, initial_bases, target_bases ) @@ -488,7 +506,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiStreamAndProvider<'lo enable_v2_manifest_paths: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional initial_bases: JObject, // Optional> target_bases: JObject, // Optional> ) -> JObject<'local> { @@ -507,7 +524,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiStreamAndProvider<'lo enable_v2_manifest_paths, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj, initial_bases, target_bases ) @@ -528,7 +544,6 @@ fn inner_create_with_ffi_stream<'local>( enable_v2_manifest_paths: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional initial_bases: JObject, // Optional> target_bases: JObject, // Optional> ) -> Result> { @@ -546,7 +561,6 @@ fn inner_create_with_ffi_stream<'local>( enable_v2_manifest_paths, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj, initial_bases, target_bases, reader, @@ -566,7 +580,6 @@ fn create_dataset<'local>( enable_v2_manifest_paths: JObject, storage_options_obj: JObject, storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, initial_bases: JObject, target_bases: JObject, reader: impl RecordBatchReader + Send + 'static, @@ -584,7 +597,6 @@ fn create_dataset<'local>( Some(&enable_v2_manifest_paths), &storage_options_obj, &storage_options_provider_obj, - &s3_credentials_refresh_offset_seconds_obj, &initial_bases, &target_bases, )?; @@ -1061,7 +1073,6 @@ pub extern "system" fn Java_org_lance_Dataset_openNative<'local>( storage_options_obj: JObject, // Map serialized_manifest: JObject, // Optional storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw!( env, @@ -1075,7 +1086,6 @@ pub extern "system" fn Java_org_lance_Dataset_openNative<'local>( storage_options_obj, serialized_manifest, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj ) ) } @@ -1091,7 +1101,6 @@ fn inner_open_native<'local>( storage_options_obj: JObject, // Map serialized_manifest: JObject, // Optional storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let path_str: String = path.extract(env)?; let version = env.get_int_opt(&version_obj)?; @@ -1108,11 +1117,6 @@ fn inner_open_native<'local>( let storage_options_provider_arc = storage_options_provider.map(|v| Arc::new(v) as Arc); - // Extract s3_credentials_refresh_offset_seconds - let s3_credentials_refresh_offset_seconds = env - .get_long_opt(&s3_credentials_refresh_offset_seconds_obj)? - .map(|v| v as u64); - let serialized_manifest = env.get_bytes_opt(&serialized_manifest)?; let dataset = BlockingDataset::open( &path_str, @@ -1123,7 +1127,6 @@ fn inner_open_native<'local>( storage_options, serialized_manifest, storage_options_provider_arc, - s3_credentials_refresh_offset_seconds, )?; dataset.into_java(env) } @@ -1319,6 +1322,58 @@ fn inner_latest_version_id(env: &mut JNIEnv, java_dataset: JObject) -> Result( + mut env: JNIEnv<'local>, + java_dataset: JObject, +) -> JObject<'local> { + ok_or_throw!( + env, + inner_get_initial_storage_options(&mut env, java_dataset) + ) +} + +fn inner_get_initial_storage_options<'local>( + env: &mut JNIEnv<'local>, + java_dataset: JObject, +) -> Result> { + let storage_options = { + let dataset_guard = + unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; + dataset_guard.initial_storage_options() + }; + match storage_options { + Some(opts) => opts.into_java(env), + None => Ok(JObject::null()), + } +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_Dataset_nativeGetLatestStorageOptions<'local>( + mut env: JNIEnv<'local>, + java_dataset: JObject, +) -> JObject<'local> { + ok_or_throw!( + env, + inner_get_latest_storage_options(&mut env, java_dataset) + ) +} + +fn inner_get_latest_storage_options<'local>( + env: &mut JNIEnv<'local>, + java_dataset: JObject, +) -> Result> { + let storage_options = { + let dataset_guard = + unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; + dataset_guard.latest_storage_options()? + }; + match storage_options { + Some(opts) => opts.into_java(env), + None => Ok(JObject::null()), + } +} + #[no_mangle] pub extern "system" fn Java_org_lance_Dataset_nativeCheckoutLatest( mut env: JNIEnv, @@ -2178,7 +2233,9 @@ fn transform_jstorage_options( Ok(storage_options .map(|options| { Some(ObjectStoreParams { - storage_options: Some(options), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(options), + )), ..Default::default() }) }) diff --git a/java/lance-jni/src/file_reader.rs b/java/lance-jni/src/file_reader.rs index 11591b3acea..ccaac121579 100644 --- a/java/lance-jni/src/file_reader.rs +++ b/java/lance-jni/src/file_reader.rs @@ -112,7 +112,9 @@ fn inner_open<'local>( let storage_options = to_rust_map(env, &jmap)?; let reader = RT.block_on(async move { let object_params = ObjectStoreParams { - storage_options: Some(storage_options), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), ..Default::default() }; let (obj_store, path) = ObjectStore::from_uri_and_params( diff --git a/java/lance-jni/src/file_writer.rs b/java/lance-jni/src/file_writer.rs index 600d7de2845..ebc5b1c328b 100644 --- a/java/lance-jni/src/file_writer.rs +++ b/java/lance-jni/src/file_writer.rs @@ -94,7 +94,9 @@ fn inner_open<'local>( let writer = RT.block_on(async move { let object_params = ObjectStoreParams { - storage_options: Some(storage_options), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), ..Default::default() }; let (obj_store, path) = ObjectStore::from_uri_and_params( diff --git a/java/lance-jni/src/fragment.rs b/java/lance-jni/src/fragment.rs index 2ca3fe4824f..07cc0d53d73 100644 --- a/java/lance-jni/src/fragment.rs +++ b/java/lance-jni/src/fragment.rs @@ -91,7 +91,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiArray<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw_with_return!( env, @@ -108,7 +107,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiArray<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj ), JObject::default() ) @@ -128,7 +126,6 @@ fn inner_create_with_ffi_array<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let c_array_ptr = arrow_array_addr as *mut FFI_ArrowArray; let c_schema_ptr = arrow_schema_addr as *mut FFI_ArrowSchema; @@ -154,7 +151,6 @@ fn inner_create_with_ffi_array<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj, reader, ) } @@ -173,7 +169,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'a> { ok_or_throw_with_return!( env, @@ -189,7 +184,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj ), JObject::null() ) @@ -208,7 +202,6 @@ fn inner_create_with_ffi_stream<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let stream_ptr = arrow_array_stream_addr as *mut FFI_ArrowArrayStream; let reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }?; @@ -224,7 +217,6 @@ fn inner_create_with_ffi_stream<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj, reader, ) } @@ -241,7 +233,6 @@ fn create_fragment<'a>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional source: impl StreamingWriteSource, ) -> Result> { let path_str = dataset_uri.extract(env)?; @@ -257,7 +248,6 @@ fn create_fragment<'a>( None, &storage_options_obj, &storage_options_provider_obj, - &s3_credentials_refresh_offset_seconds_obj, &JObject::null(), // not used when creating fragments &JObject::null(), // not used when creating fragments )?; diff --git a/java/lance-jni/src/traits.rs b/java/lance-jni/src/traits.rs index 7da64d453c2..ebc53b1679a 100644 --- a/java/lance-jni/src/traits.rs +++ b/java/lance-jni/src/traits.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +use std::collections::HashMap; + use jni::objects::{JIntArray, JLongArray, JMap, JObject, JString, JValue, JValueGen}; use jni::JNIEnv; @@ -224,6 +226,26 @@ impl IntoJava for &String { } } +impl IntoJava for HashMap { + fn into_java<'a>(self, env: &mut JNIEnv<'a>) -> Result> { + let hash_map = env.new_object("java/util/HashMap", "()V", &[])?; + for (key, value) in self { + let java_key = env.new_string(&key)?; + let java_value = env.new_string(&value)?; + env.call_method( + &hash_map, + "put", + "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;", + &[ + JValueGen::Object(&java_key.into()), + JValueGen::Object(&java_value.into()), + ], + )?; + } + Ok(hash_map) + } +} + impl IntoJava for JLance> { fn into_java<'a>(self, env: &mut JNIEnv<'a>) -> Result> { let obj = match self.0 { diff --git a/java/lance-jni/src/transaction.rs b/java/lance-jni/src/transaction.rs index 5f43f454154..42331f99b5f 100644 --- a/java/lance-jni/src/transaction.rs +++ b/java/lance-jni/src/transaction.rs @@ -754,27 +754,49 @@ fn inner_commit_transaction<'local>( .call_method(&java_transaction, "writeParams", "()Ljava/util/Map;", &[])? .l()?; let write_param_jmap = JMap::from_env(env, &write_param_jobj)?; - let mut write_param = to_rust_map(env, &write_param_jmap)?; + let write_param = to_rust_map(env, &write_param_jmap)?; - // Extract s3_credentials_refresh_offset_seconds from write_param - let s3_credentials_refresh_offset = write_param - .remove("s3_credentials_refresh_offset_seconds") - .and_then(|v| v.parse::().ok()) - .map(std::time::Duration::from_secs) - .unwrap_or_else(|| std::time::Duration::from_secs(10)); - - // Get the Dataset's storage_options_provider - let storage_options_provider = { + // Get the Dataset's storage_options_accessor and merge with write_param + let storage_options_accessor = { let dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(&java_dataset, NATIVE_DATASET) }?; - dataset_guard.get_storage_options_provider() + let existing_accessor = dataset_guard.inner.storage_options_accessor(); + + // Merge write_param with existing accessor's initial options + match existing_accessor { + Some(accessor) => { + let mut merged = accessor + .initial_storage_options() + .cloned() + .unwrap_or_default(); + merged.extend(write_param); + if let Some(provider) = accessor.provider().cloned() { + Some(Arc::new( + lance::io::StorageOptionsAccessor::with_initial_and_provider( + merged, provider, + ), + )) + } else { + Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(merged), + )) + } + } + None => { + if !write_param.is_empty() { + Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(write_param), + )) + } else { + None + } + } + } }; - // Build ObjectStoreParams using write_param for storage_options and provider from Dataset + // Build ObjectStoreParams using the merged accessor let store_params = ObjectStoreParams { - storage_options: Some(write_param), - storage_options_provider, - s3_credentials_refresh_offset, + storage_options_accessor, ..Default::default() }; diff --git a/java/lance-jni/src/utils.rs b/java/lance-jni/src/utils.rs index a87df02040f..02c5596d74b 100644 --- a/java/lance-jni/src/utils.rs +++ b/java/lance-jni/src/utils.rs @@ -50,7 +50,6 @@ pub fn extract_write_params( enable_v2_manifest_paths: Option<&JObject>, storage_options_obj: &JObject, storage_options_provider_obj: &JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: &JObject, // Optional initial_bases: &JObject, // Optional target_bases: &JObject, // Optional ) -> Result { @@ -90,19 +89,11 @@ pub fn extract_write_params( extract_storage_options(env, storage_options_obj)?; // Extract storage options provider if present - let storage_options_provider = env + let storage_options_provider: Option> = env .get_optional(storage_options_provider_obj, |env, provider_obj| { JavaStorageOptionsProvider::new(env, provider_obj) - })?; - - let storage_options_provider_arc: Option> = - storage_options_provider.map(|v| Arc::new(v) as Arc); - - // Extract s3_credentials_refresh_offset_seconds if present - let s3_credentials_refresh_offset = env - .get_long_opt(s3_credentials_refresh_offset_seconds_obj)? - .map(|v| std::time::Duration::from_secs(v as u64)) - .unwrap_or_else(|| std::time::Duration::from_secs(10)); + })? + .map(|p| Arc::new(p) as Arc); if let Some(initial_bases) = env.get_list_opt(initial_bases, |env, elem| elem.extract_object(env))? @@ -114,10 +105,22 @@ pub fn extract_write_params( write_params.target_base_names_or_paths = Some(names); } + // Create storage options accessor from storage_options and provider + let accessor = match (storage_options.is_empty(), storage_options_provider) { + (false, Some(provider)) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_initial_and_provider(storage_options, provider), + )), + (false, None) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), + (true, Some(provider)) => Some(Arc::new(lance::io::StorageOptionsAccessor::with_provider( + provider, + ))), + (true, None) => None, + }; + write_params.store_params = Some(ObjectStoreParams { - storage_options: Some(storage_options), - storage_options_provider: storage_options_provider_arc, - s3_credentials_refresh_offset, + storage_options_accessor: accessor, ..Default::default() }); Ok(write_params) diff --git a/java/src/main/java/org/lance/Dataset.java b/java/src/main/java/org/lance/Dataset.java index cc4e0d03dbb..596e9f83c91 100644 --- a/java/src/main/java/org/lance/Dataset.java +++ b/java/src/main/java/org/lance/Dataset.java @@ -144,7 +144,6 @@ public static Dataset create( params.getDataStorageVersion(), params.getEnableV2ManifestPaths(), params.getStorageOptions(), - params.getS3CredentialsRefreshOffsetSeconds(), params.getInitialBases(), params.getTargetBases()); dataset.allocator = allocator; @@ -206,7 +205,6 @@ static Dataset create( params.getEnableV2ManifestPaths(), params.getStorageOptions(), Optional.ofNullable(storageOptionsProvider), - params.getS3CredentialsRefreshOffsetSeconds(), params.getInitialBases(), params.getTargetBases()); dataset.allocator = allocator; @@ -224,7 +222,6 @@ private static native Dataset createWithFfiSchema( Optional dataStorageVersion, Optional enableV2ManifestPaths, Map storageOptions, - Optional s3CredentialsRefreshOffsetSeconds, Optional> initialBases, Optional> targetBases); @@ -239,7 +236,6 @@ private static native Dataset createWithFfiStream( Optional dataStorageVersion, Optional enableV2ManifestPaths, Map storageOptions, - Optional s3CredentialsRefreshOffsetSeconds, Optional> initialBases, Optional> targetBases); @@ -255,7 +251,6 @@ private static native Dataset createWithFfiStreamAndProvider( Optional enableV2ManifestPaths, Map storageOptions, Optional storageOptionsProvider, - Optional s3CredentialsRefreshOffsetSeconds, Optional> initialBases, Optional> targetBases); @@ -335,8 +330,7 @@ static Dataset open( options.getMetadataCacheSizeBytes(), options.getStorageOptions(), options.getSerializedManifest(), - options.getStorageOptionsProvider(), - options.getS3CredentialsRefreshOffsetSeconds()); + options.getStorageOptionsProvider()); dataset.allocator = allocator; dataset.selfManagedAllocator = selfManagedAllocator; return dataset; @@ -350,8 +344,7 @@ private static native Dataset openNative( long metadataCacheSizeBytes, Map storageOptions, Optional serializedManifest, - Optional storageOptionsProvider, - Optional s3CredentialsRefreshOffsetSeconds); + Optional storageOptionsProvider); /** * Creates a builder for opening a dataset. @@ -758,6 +751,42 @@ public long latestVersion() { private native long nativeGetLatestVersionId(); + /** + * Get the initial storage options used to open this dataset. + * + *

This returns the options that were provided when the dataset was opened, without any refresh + * from the provider. Returns null if no storage options were provided. + * + * @return the initial storage options, or null if none were provided + */ + public Map getInitialStorageOptions() { + try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) { + Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed"); + return nativeGetInitialStorageOptions(); + } + } + + private native Map nativeGetInitialStorageOptions(); + + /** + * Get the latest storage options, potentially refreshed from the provider. + * + *

If a storage options provider was configured and credentials are expiring, this will refresh + * them. + * + * @return the latest storage options (static or refreshed from provider), or null if no storage + * options were configured for this dataset + * @throws RuntimeException if an error occurs while fetching/refreshing options from the provider + */ + public Map getLatestStorageOptions() { + try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) { + Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed"); + return nativeGetLatestStorageOptions(); + } + } + + private native Map nativeGetLatestStorageOptions(); + /** Checkout the dataset to the latest version. */ public void checkoutLatest() { try (LockManager.WriteLock writeLock = lockManager.acquireWriteLock()) { diff --git a/java/src/main/java/org/lance/Fragment.java b/java/src/main/java/org/lance/Fragment.java index 62bcabf9bdd..8eb1f70053d 100644 --- a/java/src/main/java/org/lance/Fragment.java +++ b/java/src/main/java/org/lance/Fragment.java @@ -211,7 +211,6 @@ private native FragmentUpdateResult nativeUpdateColumns( * .allocator(allocator) * .data(vectorSchemaRoot) * .storageOptions(storageOptions) - * .s3CredentialsRefreshOffsetSeconds(10) * .execute(); * } * @@ -277,8 +276,7 @@ public static List create( params.getEnableStableRowIds(), params.getDataStorageVersion(), params.getStorageOptions(), - Optional.ofNullable(storageOptionsProvider), - params.getS3CredentialsRefreshOffsetSeconds()); + Optional.ofNullable(storageOptionsProvider)); } } @@ -330,8 +328,7 @@ public static List create( params.getEnableStableRowIds(), params.getDataStorageVersion(), params.getStorageOptions(), - Optional.ofNullable(storageOptionsProvider), - params.getS3CredentialsRefreshOffsetSeconds()); + Optional.ofNullable(storageOptionsProvider)); } /** @@ -350,8 +347,7 @@ private static native List createWithFfiArray( Optional enableStableRowIds, Optional dataStorageVersion, Map storageOptions, - Optional storageOptionsProvider, - Optional s3CredentialsRefreshOffsetSeconds); + Optional storageOptionsProvider); /** * Create a fragment from the given arrow stream. @@ -368,6 +364,5 @@ private static native List createWithFfiStream( Optional enableStableRowIds, Optional dataStorageVersion, Map storageOptions, - Optional storageOptionsProvider, - Optional s3CredentialsRefreshOffsetSeconds); + Optional storageOptionsProvider); } diff --git a/java/src/main/java/org/lance/OpenDatasetBuilder.java b/java/src/main/java/org/lance/OpenDatasetBuilder.java index ae082e14ceb..fc350fb0fcf 100644 --- a/java/src/main/java/org/lance/OpenDatasetBuilder.java +++ b/java/src/main/java/org/lance/OpenDatasetBuilder.java @@ -58,7 +58,6 @@ public class OpenDatasetBuilder { private LanceNamespace namespace; private List tableId; private ReadOptions options = new ReadOptions.Builder().build(); - private boolean ignoreNamespaceTableStorageOptions = false; /** Creates a new builder instance. Package-private, use Dataset.open() instead. */ OpenDatasetBuilder() {} @@ -128,19 +127,6 @@ public OpenDatasetBuilder readOptions(ReadOptions options) { return this; } - /** - * Sets whether to ignore storage options from the namespace's describeTable(). - * - * @param ignoreNamespaceTableStorageOptions If true, storage options returned from - * describeTable() will be ignored (treated as null) - * @return this builder instance - */ - public OpenDatasetBuilder ignoreNamespaceTableStorageOptions( - boolean ignoreNamespaceTableStorageOptions) { - this.ignoreNamespaceTableStorageOptions = ignoreNamespaceTableStorageOptions; - return this; - } - /** * Opens the dataset with the configured parameters. * @@ -204,8 +190,7 @@ private Dataset buildFromNamespace() { throw new IllegalArgumentException("Namespace did not return a table location"); } - Map namespaceStorageOptions = - ignoreNamespaceTableStorageOptions ? null : response.getStorageOptions(); + Map namespaceStorageOptions = response.getStorageOptions(); ReadOptions.Builder optionsBuilder = new ReadOptions.Builder() @@ -221,9 +206,6 @@ private Dataset buildFromNamespace() { options.getVersion().ifPresent(optionsBuilder::setVersion); options.getBlockSize().ifPresent(optionsBuilder::setBlockSize); options.getSerializedManifest().ifPresent(optionsBuilder::setSerializedManifest); - options - .getS3CredentialsRefreshOffsetSeconds() - .ifPresent(optionsBuilder::setS3CredentialsRefreshOffsetSeconds); Map storageOptions = new HashMap<>(options.getStorageOptions()); if (namespaceStorageOptions != null) { diff --git a/java/src/main/java/org/lance/ReadOptions.java b/java/src/main/java/org/lance/ReadOptions.java index 9d08c834008..0a7a0343a79 100644 --- a/java/src/main/java/org/lance/ReadOptions.java +++ b/java/src/main/java/org/lance/ReadOptions.java @@ -32,7 +32,6 @@ public class ReadOptions { private final Optional serializedManifest; private final Map storageOptions; private final Optional storageOptionsProvider; - private final Optional s3CredentialsRefreshOffsetSeconds; private ReadOptions(Builder builder) { this.version = builder.version; @@ -42,7 +41,6 @@ private ReadOptions(Builder builder) { this.storageOptions = builder.storageOptions; this.serializedManifest = builder.serializedManifest; this.storageOptionsProvider = builder.storageOptionsProvider; - this.s3CredentialsRefreshOffsetSeconds = builder.s3CredentialsRefreshOffsetSeconds; } public Optional getVersion() { @@ -73,10 +71,6 @@ public Optional getStorageOptionsProvider() { return storageOptionsProvider; } - public Optional getS3CredentialsRefreshOffsetSeconds() { - return s3CredentialsRefreshOffsetSeconds; - } - @Override public String toString() { return MoreObjects.toStringHelper(this) @@ -100,7 +94,6 @@ public static class Builder { private Map storageOptions = new HashMap<>(); private Optional serializedManifest = Optional.empty(); private Optional storageOptionsProvider = Optional.empty(); - private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); /** * Set the version of the dataset to read. If not set, read from latest version. @@ -221,22 +214,6 @@ public Builder setStorageOptionsProvider(StorageOptionsProvider storageOptionsPr return this; } - /** - * Set the number of seconds before credential expiration to trigger a refresh. - * - *

Default is 60 seconds. Only applicable when using AWS S3 with temporary credentials. For - * example, if set to 60, credentials will be refreshed when they have less than 60 seconds - * remaining before expiration. This should be set shorter than the credential lifetime to avoid - * using expired credentials. - * - * @param s3CredentialsRefreshOffsetSeconds the refresh offset in seconds - * @return this builder - */ - public Builder setS3CredentialsRefreshOffsetSeconds(long s3CredentialsRefreshOffsetSeconds) { - this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); - return this; - } - public ReadOptions build() { return new ReadOptions(this); } diff --git a/java/src/main/java/org/lance/Transaction.java b/java/src/main/java/org/lance/Transaction.java index 67bc5f8d93d..2d565c73258 100644 --- a/java/src/main/java/org/lance/Transaction.java +++ b/java/src/main/java/org/lance/Transaction.java @@ -118,7 +118,6 @@ public static class Builder { private Operation operation; private Map writeParams; private Map transactionProperties; - private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); public Builder(Dataset dataset) { this.dataset = dataset; @@ -140,21 +139,6 @@ public Builder writeParams(Map writeParams) { return this; } - /** - * Sets the S3 credentials refresh offset in seconds. - * - *

This parameter controls how long before credential expiration to refresh them. For - * example, if credentials expire at T+60s and this is set to 10, credentials will be refreshed - * at T+50s. - * - * @param s3CredentialsRefreshOffsetSeconds Refresh offset in seconds - * @return this builder instance - */ - public Builder s3CredentialsRefreshOffsetSeconds(long s3CredentialsRefreshOffsetSeconds) { - this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); - return this; - } - public Builder operation(Operation operation) { validateState(); this.operation = operation; @@ -171,15 +155,8 @@ private void validateState() { public Transaction build() { Preconditions.checkState(operation != null, "TransactionBuilder has no operations"); - // Merge s3_credentials_refresh_offset_seconds into writeParams if present - Map finalWriteParams = - writeParams != null ? new HashMap<>(writeParams) : new HashMap<>(); - s3CredentialsRefreshOffsetSeconds.ifPresent( - value -> - finalWriteParams.put("s3_credentials_refresh_offset_seconds", String.valueOf(value))); - return new Transaction( - dataset, readVersion, uuid, operation, finalWriteParams, transactionProperties); + dataset, readVersion, uuid, operation, writeParams, transactionProperties); } } } diff --git a/java/src/main/java/org/lance/WriteDatasetBuilder.java b/java/src/main/java/org/lance/WriteDatasetBuilder.java index ec7e856b4f3..8a8c33163da 100644 --- a/java/src/main/java/org/lance/WriteDatasetBuilder.java +++ b/java/src/main/java/org/lance/WriteDatasetBuilder.java @@ -80,7 +80,6 @@ public class WriteDatasetBuilder { private Optional maxBytesPerFile = Optional.empty(); private Optional enableStableRowIds = Optional.empty(); private Optional dataStorageVersion = Optional.empty(); - private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); private Optional> initialBases = Optional.empty(); private Optional> targetBases = Optional.empty(); @@ -276,21 +275,6 @@ public WriteDatasetBuilder dataStorageVersion(WriteParams.LanceFileVersion dataS return this; } - /** - * Sets the S3 credentials refresh offset in seconds. - * - *

This parameter controls how long before credential expiration to refresh them. For example, - * if credentials expire at T+60s and this is set to 10, credentials will be refreshed at T+50s. - * - * @param s3CredentialsRefreshOffsetSeconds Refresh offset in seconds - * @return this builder instance - */ - public WriteDatasetBuilder s3CredentialsRefreshOffsetSeconds( - long s3CredentialsRefreshOffsetSeconds) { - this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); - return this; - } - public WriteDatasetBuilder initialBases(List bases) { this.initialBases = Optional.of(bases); return this; @@ -424,8 +408,6 @@ private Dataset executeWithNamespace() { maxBytesPerFile.ifPresent(paramsBuilder::withMaxBytesPerFile); enableStableRowIds.ifPresent(paramsBuilder::withEnableStableRowIds); dataStorageVersion.ifPresent(paramsBuilder::withDataStorageVersion); - s3CredentialsRefreshOffsetSeconds.ifPresent( - paramsBuilder::withS3CredentialsRefreshOffsetSeconds); initialBases.ifPresent(paramsBuilder::withInitialBases); targetBases.ifPresent(paramsBuilder::withTargetBases); @@ -451,8 +433,6 @@ private Dataset executeWithUri() { maxBytesPerFile.ifPresent(paramsBuilder::withMaxBytesPerFile); enableStableRowIds.ifPresent(paramsBuilder::withEnableStableRowIds); dataStorageVersion.ifPresent(paramsBuilder::withDataStorageVersion); - s3CredentialsRefreshOffsetSeconds.ifPresent( - paramsBuilder::withS3CredentialsRefreshOffsetSeconds); initialBases.ifPresent(paramsBuilder::withInitialBases); targetBases.ifPresent(paramsBuilder::withTargetBases); diff --git a/java/src/main/java/org/lance/WriteFragmentBuilder.java b/java/src/main/java/org/lance/WriteFragmentBuilder.java index 76882b14a29..56ce06a7b0a 100644 --- a/java/src/main/java/org/lance/WriteFragmentBuilder.java +++ b/java/src/main/java/org/lance/WriteFragmentBuilder.java @@ -37,7 +37,6 @@ * .allocator(allocator) * .data(vectorSchemaRoot) * .storageOptions(storageOptions) - * .s3CredentialsRefreshOffsetSeconds(10) * .execute(); * } */ @@ -134,21 +133,6 @@ public WriteFragmentBuilder storageOptionsProvider(StorageOptionsProvider provid return this; } - /** - * Set the S3 credentials refresh offset in seconds. - * - *

This parameter controls how long before credential expiration to refresh them. For example, - * if credentials expire at T+60s and this is set to 10, credentials will be refreshed at T+50s. - * - * @param seconds refresh offset in seconds - * @return this builder - */ - public WriteFragmentBuilder s3CredentialsRefreshOffsetSeconds(long seconds) { - ensureWriteParamsBuilder(); - this.writeParamsBuilder.withS3CredentialsRefreshOffsetSeconds(seconds); - return this; - } - /** * Set the maximum number of rows per file. * diff --git a/java/src/main/java/org/lance/WriteParams.java b/java/src/main/java/org/lance/WriteParams.java index 2a4853383dd..c095009bc67 100644 --- a/java/src/main/java/org/lance/WriteParams.java +++ b/java/src/main/java/org/lance/WriteParams.java @@ -58,7 +58,6 @@ public String getVersionString() { private final Optional dataStorageVersion; private final Optional enableV2ManifestPaths; private Map storageOptions = new HashMap<>(); - private final Optional s3CredentialsRefreshOffsetSeconds; private final Optional> initialBases; private final Optional> targetBases; @@ -71,7 +70,6 @@ private WriteParams( Optional dataStorageVersion, Optional enableV2ManifestPaths, Map storageOptions, - Optional s3CredentialsRefreshOffsetSeconds, Optional> initialBases, Optional> targetBases) { this.maxRowsPerFile = maxRowsPerFile; @@ -82,7 +80,6 @@ private WriteParams( this.dataStorageVersion = dataStorageVersion; this.enableV2ManifestPaths = enableV2ManifestPaths; this.storageOptions = storageOptions; - this.s3CredentialsRefreshOffsetSeconds = s3CredentialsRefreshOffsetSeconds; this.initialBases = initialBases; this.targetBases = targetBases; } @@ -124,10 +121,6 @@ public Map getStorageOptions() { return storageOptions; } - public Optional getS3CredentialsRefreshOffsetSeconds() { - return s3CredentialsRefreshOffsetSeconds; - } - public Optional> getInitialBases() { return initialBases; } @@ -157,7 +150,6 @@ public static class Builder { private Optional dataStorageVersion = Optional.empty(); private Optional enableV2ManifestPaths; private Map storageOptions = new HashMap<>(); - private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); private Optional> initialBases = Optional.empty(); private Optional> targetBases = Optional.empty(); @@ -201,11 +193,6 @@ public Builder withEnableV2ManifestPaths(boolean enableV2ManifestPaths) { return this; } - public Builder withS3CredentialsRefreshOffsetSeconds(long s3CredentialsRefreshOffsetSeconds) { - this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); - return this; - } - public Builder withInitialBases(List initialBases) { this.initialBases = Optional.of(initialBases); return this; @@ -226,7 +213,6 @@ public WriteParams build() { dataStorageVersion, enableV2ManifestPaths, storageOptions, - s3CredentialsRefreshOffsetSeconds, initialBases, targetBases); } diff --git a/java/src/test/java/org/lance/NamespaceIntegrationTest.java b/java/src/test/java/org/lance/NamespaceIntegrationTest.java index ad0b55dccdc..2d6f8ab1443 100644 --- a/java/src/test/java/org/lance/NamespaceIntegrationTest.java +++ b/java/src/test/java/org/lance/NamespaceIntegrationTest.java @@ -203,6 +203,8 @@ private Map modifyStorageOptions( long expiresAtMillis = System.currentTimeMillis() + (credentialExpiresInSeconds * 1000L); modified.put("expires_at_millis", String.valueOf(expiresAtMillis)); + // Set refresh offset to 1 second (1000ms) for short-lived credential tests + modified.put("refresh_offset_millis", "1000"); return modified; } @@ -326,11 +328,7 @@ public VectorSchemaRoot getVectorSchemaRoot() { assertEquals(1, namespace.getCreateCallCount(), "createEmptyTable should be called once"); // Open dataset through namespace WITH refresh enabled - // Use 10-second refresh offset, so credentials effectively expire at T+50s - ReadOptions readOptions = - new ReadOptions.Builder() - .setS3CredentialsRefreshOffsetSeconds(10) // Refresh 10s before expiration - .build(); + ReadOptions readOptions = new ReadOptions.Builder().build(); int callCountBeforeOpen = namespace.getDescribeCallCount(); try (Dataset dsFromNamespace = @@ -451,7 +449,6 @@ public VectorSchemaRoot getVectorSchemaRoot() { .namespace(namespace) .tableId(Arrays.asList(tableName)) .mode(WriteParams.WriteMode.CREATE) - .s3CredentialsRefreshOffsetSeconds(2) // Refresh 2s before expiration .execute()) { assertEquals(2, dataset.countRows()); } @@ -461,11 +458,7 @@ public VectorSchemaRoot getVectorSchemaRoot() { assertEquals(1, namespace.getCreateCallCount(), "createEmptyTable should be called once"); // Open dataset through namespace with refresh enabled - // Use 2-second refresh offset so credentials effectively expire at T+3s (5s - 2s) - ReadOptions readOptions = - new ReadOptions.Builder() - .setS3CredentialsRefreshOffsetSeconds(2) // Refresh 2s before expiration - .build(); + ReadOptions readOptions = new ReadOptions.Builder().build(); int callCountBeforeOpen = namespace.getDescribeCallCount(); try (Dataset dsFromNamespace = @@ -692,7 +685,6 @@ public VectorSchemaRoot getVectorSchemaRoot() { }; // Use the write builder to create a dataset through namespace - // Set a 1-second refresh offset. Credentials expire at T+60s, so refresh at T+59s. // Write completes instantly, so NO describeTable call should happen for refresh. try (Dataset dataset = Dataset.write() @@ -701,7 +693,6 @@ public VectorSchemaRoot getVectorSchemaRoot() { .namespace(namespace) .tableId(Arrays.asList(tableName)) .mode(WriteParams.WriteMode.CREATE) - .s3CredentialsRefreshOffsetSeconds(1) .execute()) { // Verify createEmptyTable was called exactly ONCE @@ -732,9 +723,7 @@ public VectorSchemaRoot getVectorSchemaRoot() { "describeTable should still be 0 after close (no refresh needed)"); // Now open the dataset through namespace with long-lived credentials (60s expiration) - // With 1s refresh offset, credentials are valid for 59s - plenty of time for reads - ReadOptions readOptions = - new ReadOptions.Builder().setS3CredentialsRefreshOffsetSeconds(1).build(); + ReadOptions readOptions = new ReadOptions.Builder().build(); try (Dataset dsFromNamespace = Dataset.open() diff --git a/python/python/lance/__init__.py b/python/python/lance/__init__.py index ff2870b6d1a..d98de0424ad 100644 --- a/python/python/lance/__init__.py +++ b/python/python/lance/__init__.py @@ -99,8 +99,6 @@ def dataset( session: Optional[Session] = None, namespace: Optional[LanceNamespace] = None, table_id: Optional[List[str]] = None, - ignore_namespace_table_storage_options: bool = False, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ) -> LanceDataset: """ Opens the Lance dataset from the address specified. @@ -168,26 +166,13 @@ def dataset( table_id : optional, List[str] The table identifier when using a namespace (e.g., ["my_table"]). Must be provided together with `namespace`. Cannot be used with `uri`. - ignore_namespace_table_storage_options : bool, default False - Only applicable when using `namespace` and `table_id`. If True, storage - options returned from the namespace's describe_table() will be ignored - (treated as None). If False (default), storage options from describe_table() - will be used and a dynamic storage options provider will be created to - automatically refresh credentials before they expire. - s3_credentials_refresh_offset_seconds : optional, int - The number of seconds before credential expiration to trigger a refresh. - Default is 60 seconds. Only applicable when using AWS S3 with temporary - credentials. For example, if set to 60, credentials will be refreshed - when they have less than 60 seconds remaining before expiration. This - should be set shorter than the credential lifetime to avoid using - expired credentials. Notes ----- When using `namespace` and `table_id`: - The `uri` parameter is optional and will be fetched from the namespace - - Storage options from describe_table() will be used unless - `ignore_namespace_table_storage_options=True` + - Storage options from describe_table() will be used automatically + - A dynamic storage options provider will be created to refresh credentials - Initial storage options from describe_table() will be merged with any provided `storage_options` """ @@ -220,10 +205,7 @@ def dataset( if uri is None: raise ValueError("Namespace did not return a 'location' for the table") - if ignore_namespace_table_storage_options: - namespace_storage_options = None - else: - namespace_storage_options = response.storage_options + namespace_storage_options = response.storage_options if namespace_storage_options: storage_options_provider = LanceNamespaceStorageOptionsProvider( @@ -251,7 +233,6 @@ def dataset( read_params=read_params, session=session, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, ) if version is None and asof is not None: ts_cutoff = sanitize_ts(asof) @@ -276,7 +257,6 @@ def dataset( read_params=read_params, session=session, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, ) else: return ds diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 01066a88d34..3355b459ebf 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -433,7 +433,6 @@ def __init__( read_params: Optional[Dict[str, Any]] = None, session: Optional[Session] = None, storage_options_provider: Optional[Any] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ): uri = os.fspath(uri) if isinstance(uri, Path) else uri self._uri = uri @@ -463,7 +462,6 @@ def __init__( read_params=read_params, session=session, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, ) self._default_scan_options = default_scan_options self._read_params = read_params @@ -2225,6 +2223,49 @@ def latest_version(self) -> int: """ return self._ds.latest_version() + @property + def initial_storage_options(self) -> Optional[Dict[str, str]]: + """ + Get the initial storage options used to open this dataset. + + This returns the options that were provided when the dataset was opened, + without any refresh from the provider. Returns None if no storage options + were provided. + """ + return self._ds.initial_storage_options() + + def latest_storage_options(self) -> Optional[Dict[str, str]]: + """ + Get the latest storage options, potentially refreshed from the provider. + + If a storage options provider was configured and credentials are expiring, + this will refresh them. + + Returns + ------- + Optional[Dict[str, str]] + - Storage options dict if configured (static or refreshed from provider) + - None if no storage options were configured for this dataset + + Raises + ------ + IOError + If an error occurs while fetching/refreshing options from the provider + """ + return self._ds.latest_storage_options() + + @property + def storage_options_accessor(self): + """ + Get the storage options accessor for this dataset. + + The accessor bundles static storage options and optional dynamic provider, + handling caching and refresh logic internally. + + Returns None if neither storage options nor a provider were configured. + """ + return self._ds.storage_options_accessor() + def checkout_version( self, version: int | str | Tuple[Optional[str], Optional[int]] ) -> "LanceDataset": @@ -5508,8 +5549,6 @@ def write_dataset( target_bases: Optional[List[str]] = None, namespace: Optional[LanceNamespace] = None, table_id: Optional[List[str]] = None, - ignore_namespace_table_storage_options: bool = False, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ) -> LanceDataset: """Write a given data_obj to the given uri @@ -5611,29 +5650,16 @@ def write_dataset( table_id : optional, List[str] The table identifier when using a namespace (e.g., ["my_table"]). Must be provided together with `namespace`. Cannot be used with `uri`. - ignore_namespace_table_storage_options : bool, default False - If True, ignore the storage options returned by the namespace and only use - the provided `storage_options` parameter. The storage options provider will - not be created, so credentials will not be automatically refreshed. - This is useful when you want to use your own credentials instead of the - namespace-provided credentials. - s3_credentials_refresh_offset_seconds : optional, int - The number of seconds before credential expiration to trigger a refresh. - Default is 60 seconds. Only applicable when using AWS S3 with temporary - credentials. For example, if set to 60, credentials will be refreshed - when they have less than 60 seconds remaining before expiration. This - should be set shorter than the credential lifetime to avoid using - expired credentials. Notes ----- When using `namespace` and `table_id`: - The `uri` parameter is optional and will be fetched from the namespace + - Storage options from describe_table() will be used automatically - A `LanceNamespaceStorageOptionsProvider` will be created automatically for - storage options refresh (unless `ignore_namespace_table_storage_options=True`) + storage options refresh - Initial storage options from describe_table() will be merged with - any provided `storage_options` (unless - `ignore_namespace_table_storage_options=True`) + any provided `storage_options` """ # Validate that user provides either uri OR (namespace + table_id), not both has_uri = uri is not None @@ -5715,11 +5741,8 @@ def write_dataset( f"Namespace did not return a table location in {mode} response" ) - # Check if we should ignore namespace storage options - if ignore_namespace_table_storage_options: - namespace_storage_options = None - else: - namespace_storage_options = response.storage_options + # Use namespace storage options + namespace_storage_options = response.storage_options # Set up storage options and provider if namespace_storage_options: @@ -5782,12 +5805,6 @@ def write_dataset( if storage_options_provider is not None: params["storage_options_provider"] = storage_options_provider - # Add s3_credentials_refresh_offset_seconds if specified - if s3_credentials_refresh_offset_seconds is not None: - params["s3_credentials_refresh_offset_seconds"] = ( - s3_credentials_refresh_offset_seconds - ) - if commit_lock: if not callable(commit_lock): raise TypeError(f"commit_lock must be a function, got {type(commit_lock)}") diff --git a/python/python/lance/file.py b/python/python/lance/file.py index dec4aea00b6..8a20e4aff2f 100644 --- a/python/python/lance/file.py +++ b/python/python/lance/file.py @@ -68,7 +68,6 @@ def __init__( columns: Optional[List[str]] = None, *, storage_options_provider: Optional[StorageOptionsProvider] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, _inner_reader: Optional[_LanceFileReader] = None, ): """ @@ -86,9 +85,6 @@ def __init__( storage_options_provider : optional A provider that can provide storage options dynamically. This is useful for credentials that need to be refreshed or vended on-demand. - s3_credentials_refresh_offset_seconds : optional, int - How early (in seconds) before expiration to refresh S3 credentials. - Default is 60 seconds. Only applies when using storage_options_provider. columns: list of str, default None List of column names to be fetched. All columns are fetched if None or unspecified. @@ -102,7 +98,6 @@ def __init__( path, storage_options=storage_options, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, columns=columns, ) @@ -219,7 +214,6 @@ def __init__( base_path: str, storage_options: Optional[Dict[str, str]] = None, storage_options_provider: Optional[StorageOptionsProvider] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ): """ Creates a new file session @@ -236,9 +230,6 @@ def __init__( storage_options_provider : optional A provider that can provide storage options dynamically. This is useful for credentials that need to be refreshed or vended on-demand. - s3_credentials_refresh_offset_seconds : optional, int - How early (in seconds) before expiration to refresh S3 credentials. - Default is 60 seconds. Only applies when using storage_options_provider. """ if isinstance(base_path, Path): base_path = str(base_path) @@ -246,7 +237,6 @@ def __init__( base_path, storage_options=storage_options, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, ) def open_reader( @@ -391,7 +381,6 @@ def __init__( version: Optional[str] = None, storage_options: Optional[Dict[str, str]] = None, storage_options_provider: Optional[StorageOptionsProvider] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, max_page_bytes: Optional[int] = None, _inner_writer: Optional[_LanceFileWriter] = None, **kwargs, @@ -422,9 +411,6 @@ def __init__( A storage options provider that can fetch and refresh storage options dynamically. This is useful for credentials that expire and need to be refreshed automatically. - s3_credentials_refresh_offset_seconds : optional, int - How early (in seconds) before expiration to refresh S3 credentials. - Default is 60 seconds. Only applies when using storage_options_provider. max_page_bytes : optional, int The maximum size of a page in bytes, if a single array would create a page larger than this then it will be split into multiple pages. The @@ -442,7 +428,6 @@ def __init__( version=version, storage_options=storage_options, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, max_page_bytes=max_page_bytes, **kwargs, ) diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index 506e4e267d5..a346b648ed5 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -95,7 +95,6 @@ class LanceFileWriter: version: Optional[str], storage_options: Optional[Dict[str, str]], storage_options_provider: Optional[StorageOptionsProvider], - s3_credentials_refresh_offset_seconds: Optional[int], keep_original_array: Optional[bool], max_page_bytes: Optional[int], ): ... @@ -110,7 +109,6 @@ class LanceFileSession: base_path: str, storage_options: Optional[Dict[str, str]] = None, storage_options_provider: Optional[StorageOptionsProvider] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ): ... def open_reader( self, path: str, columns: Optional[List[str]] = None @@ -135,7 +133,6 @@ class LanceFileReader: path: str, storage_options: Optional[Dict[str, str]], storage_options_provider: Optional[StorageOptionsProvider], - s3_credentials_refresh_offset_seconds: Optional[int], columns: Optional[List[str]] = None, ): ... def read_all( diff --git a/python/python/tests/test_namespace_integration.py b/python/python/tests/test_namespace_integration.py index 3c93dbcb504..30489496e38 100644 --- a/python/python/tests/test_namespace_integration.py +++ b/python/python/tests/test_namespace_integration.py @@ -128,6 +128,8 @@ def _modify_storage_options( (time.time() + self.credential_expires_in_seconds) * 1000 ) modified["expires_at_millis"] = str(expires_at_millis) + # Set refresh offset to 1 second (1000ms) for short-lived credential tests + modified["refresh_offset_millis"] = "1000" return modified @@ -235,7 +237,6 @@ def test_namespace_with_refresh(s3_bucket: str): namespace=namespace, table_id=table_id, mode="create", - s3_credentials_refresh_offset_seconds=1, ) assert ds.count_rows() == 2 assert namespace.get_create_call_count() == 1 @@ -243,7 +244,6 @@ def test_namespace_with_refresh(s3_bucket: str): ds_from_namespace = lance.dataset( namespace=namespace, table_id=table_id, - s3_credentials_refresh_offset_seconds=1, ) initial_call_count = namespace.get_describe_call_count() @@ -574,7 +574,6 @@ def test_file_writer_with_storage_options_provider(s3_bucket: str): schema=schema, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) batch = pa.RecordBatch.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6]}, schema=schema) @@ -593,7 +592,6 @@ def test_file_writer_with_storage_options_provider(s3_bucket: str): file_uri, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) result = reader.read_all(batch_size=1024) result_table = result.to_table() @@ -613,7 +611,6 @@ def test_file_writer_with_storage_options_provider(s3_bucket: str): schema=schema, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) batch3 = pa.RecordBatch.from_pydict( @@ -629,7 +626,6 @@ def test_file_writer_with_storage_options_provider(s3_bucket: str): file_uri2, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) result2 = reader2.read_all(batch_size=1024) result_table2 = result2.to_table() @@ -696,7 +692,6 @@ def test_file_reader_with_storage_options_provider(s3_bucket: str): file_uri, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) result = reader.read_all(batch_size=1024) result_table = result.to_table() @@ -727,7 +722,6 @@ def test_file_reader_with_storage_options_provider(s3_bucket: str): file_uri2, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) result2 = reader2.read_all(batch_size=1024) result_table2 = result2.to_table() @@ -778,7 +772,6 @@ def test_file_session_with_storage_options_provider(s3_bucket: str): f"s3://{s3_bucket}/{table_name}_session", storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) # Test contains method diff --git a/python/src/dataset.rs b/python/src/dataset.rs index df03763472f..11047f33c12 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -92,6 +92,7 @@ use crate::rt; use crate::scanner::ScanStatistics; use crate::schema::{logical_schema_from_lance, LanceSchema}; use crate::session::Session; +use crate::storage_options::PyStorageOptionsAccessor; use crate::utils::PyLance; use crate::{LanceReader, Scanner}; @@ -480,8 +481,9 @@ pub struct Dataset { #[pymethods] impl Dataset { #[allow(clippy::too_many_arguments)] + #[allow(deprecated)] #[new] - #[pyo3(signature=(uri, version=None, block_size=None, index_cache_size=None, metadata_cache_size=None, commit_handler=None, storage_options=None, manifest=None, metadata_cache_size_bytes=None, index_cache_size_bytes=None, read_params=None, session=None, storage_options_provider=None, s3_credentials_refresh_offset_seconds=None))] + #[pyo3(signature=(uri, version=None, block_size=None, index_cache_size=None, metadata_cache_size=None, commit_handler=None, storage_options=None, manifest=None, metadata_cache_size_bytes=None, index_cache_size_bytes=None, read_params=None, session=None, storage_options_provider=None))] fn new( py: Python, uri: String, @@ -497,7 +499,6 @@ impl Dataset { read_params: Option<&Bound>, session: Option, storage_options_provider: Option<&Bound<'_, PyAny>>, - s3_credentials_refresh_offset_seconds: Option, ) -> PyResult { let mut params = ReadParams::default(); if let Some(metadata_cache_size_bytes) = metadata_cache_size_bytes { @@ -514,16 +515,12 @@ impl Dataset { let index_cache_size_bytes = index_cache_size * 20 * 1024 * 1024; params.index_cache_size_bytes(index_cache_size_bytes); } - // Set up store options (block size and S3 credentials refresh offset) - let mut store_params = params.store_options.take().unwrap_or_default(); + // Set up store options (block size) if let Some(block_size) = block_size { + let mut store_params = params.store_options.take().unwrap_or_default(); store_params.block_size = Some(block_size); + params.store_options = Some(store_params); } - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - store_params.s3_credentials_refresh_offset = - std::time::Duration::from_secs(offset_seconds); - } - params.store_options = Some(store_params); if let Some(commit_handler) = commit_handler { let py_commit_lock = PyCommitLock::new(commit_handler); params.set_commit_lock(Arc::new(py_commit_lock)); @@ -1518,6 +1515,37 @@ impl Dataset { .map_err(|err| PyIOError::new_err(err.to_string())) } + /// Get the initial storage options used to open this dataset. + /// + /// This returns the options that were provided when the dataset was opened, + /// without any refresh from the provider. Returns None if no storage options + /// were provided. + fn initial_storage_options(&self) -> Option> { + self.ds.initial_storage_options().cloned() + } + + /// Get the latest storage options, potentially refreshed from the provider. + /// + /// If a storage options provider was configured and credentials are expiring, + /// this will refresh them. Returns the current valid storage options, or None + /// if no storage options accessor is configured. + fn latest_storage_options(self_: PyRef<'_, Self>) -> PyResult>> { + let result = rt() + .block_on(Some(self_.py()), self_.ds.latest_storage_options())? + .map_err(|err| PyIOError::new_err(err.to_string()))?; + Ok(result.map(|opts| opts.0)) + } + + /// Get the storage options accessor for this dataset. + /// + /// The accessor bundles static storage options and optional dynamic provider, + /// handling caching and refresh logic internally. + fn storage_options_accessor(&self) -> Option { + self.ds + .storage_options_accessor() + .map(PyStorageOptionsAccessor::new) + } + fn checkout_version(&self, version: Bound) -> PyResult { let reference = self.transform_ref(Some(version))?; self._checkout_version(reference) @@ -1536,7 +1564,9 @@ impl Dataset { // `version` can be a version number or a tag name. // `storage_options` will be forwarded to the object store params for the new dataset. let store_params = storage_options.as_ref().map(|opts| ObjectStoreParams { - storage_options: Some(opts.clone()), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(opts.clone()), + )), ..Default::default() }); @@ -1715,7 +1745,9 @@ impl Dataset { let mut new_self = self.ds.as_ref().clone(); let reference = self.transform_ref(reference)?; let store_params = storage_options.map(|opts| ObjectStoreParams { - storage_options: Some(opts), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(opts), + )), ..Default::default() }); let created = rt() @@ -2222,6 +2254,7 @@ impl Dataset { } #[allow(clippy::too_many_arguments)] + #[allow(deprecated)] #[staticmethod] #[pyo3(signature = (dest, transaction, commit_lock = None, storage_options = None, storage_options_provider = None, enable_v2_manifest_paths = None, detached = None, max_retries = None))] fn commit_transaction( @@ -2234,14 +2267,14 @@ impl Dataset { detached: Option, max_retries: Option, ) -> PyResult { - let provider = storage_options_provider - .map(crate::storage_options::py_object_to_storage_options_provider) - .transpose()?; + let accessor = crate::storage_options::create_accessor_from_python( + storage_options.clone(), + storage_options_provider, + )?; - let object_store_params = if storage_options.is_some() || provider.is_some() { + let object_store_params = if accessor.is_some() { Some(ObjectStoreParams { - storage_options: storage_options.clone(), - storage_options_provider: provider, + storage_options_accessor: accessor, ..Default::default() }) } else { @@ -2285,6 +2318,7 @@ impl Dataset { } #[allow(clippy::too_many_arguments)] + #[allow(deprecated)] #[staticmethod] #[pyo3(signature = (dest, transactions, commit_lock = None, storage_options = None, storage_options_provider = None, enable_v2_manifest_paths = None, detached = None, max_retries = None))] fn commit_batch( @@ -2297,14 +2331,14 @@ impl Dataset { detached: Option, max_retries: Option, ) -> PyResult<(Self, PyLance)> { - let provider = storage_options_provider - .map(crate::storage_options::py_object_to_storage_options_provider) - .transpose()?; + let accessor = crate::storage_options::create_accessor_from_python( + storage_options.clone(), + storage_options_provider, + )?; - let object_store_params = if storage_options.is_some() || provider.is_some() { + let object_store_params = if accessor.is_some() { Some(ObjectStoreParams { - storage_options: storage_options.clone(), - storage_options_provider: provider, + storage_options_accessor: accessor, ..Default::default() }) } else { @@ -3075,6 +3109,7 @@ fn get_dict_opt<'a, 'py, D: FromPyObject<'a>>( .transpose() } +#[allow(deprecated)] pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult> { let params = if options.is_none() { None @@ -3102,29 +3137,17 @@ pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult>(options, "storage_options")?; let storage_options_provider = - get_dict_opt::>(options, "storage_options_provider")? - .map(|py_obj| { - crate::storage_options::py_object_to_storage_options_provider( - py_obj.bind(options.py()), - ) - }) - .transpose()?; + get_dict_opt::>(options, "storage_options_provider")?; - let s3_credentials_refresh_offset_seconds = - get_dict_opt::(options, "s3_credentials_refresh_offset_seconds")?; - - if storage_options.is_some() - || storage_options_provider.is_some() - || s3_credentials_refresh_offset_seconds.is_some() - { - let s3_credentials_refresh_offset = s3_credentials_refresh_offset_seconds - .map(std::time::Duration::from_secs) - .unwrap_or(std::time::Duration::from_secs(60)); - - p.store_params = Some(ObjectStoreParams { + if storage_options.is_some() || storage_options_provider.is_some() { + let accessor = crate::storage_options::create_accessor_from_python( storage_options, - storage_options_provider, - s3_credentials_refresh_offset, + storage_options_provider + .as_ref() + .map(|py_obj| py_obj.bind(options.py())), + )?; + p.store_params = Some(ObjectStoreParams { + storage_options_accessor: accessor, ..Default::default() }); } diff --git a/python/src/file.rs b/python/src/file.rs index ba96710a0d5..4256d4e74d6 100644 --- a/python/src/file.rs +++ b/python/src/file.rs @@ -241,7 +241,6 @@ impl LanceFileWriter { version: Option, storage_options: Option>, storage_options_provider: Option>, - s3_credentials_refresh_offset_seconds: Option, keep_original_array: Option, max_page_bytes: Option, ) -> PyResult { @@ -249,7 +248,6 @@ impl LanceFileWriter { uri_or_path, storage_options, storage_options_provider, - s3_credentials_refresh_offset_seconds, ) .await?; Self::open_with_store( @@ -299,7 +297,7 @@ impl LanceFileWriter { #[pymethods] impl LanceFileWriter { #[new] - #[pyo3(signature=(path, schema=None, data_cache_bytes=None, version=None, storage_options=None, storage_options_provider=None, s3_credentials_refresh_offset_seconds=None, keep_original_array=None, max_page_bytes=None))] + #[pyo3(signature=(path, schema=None, data_cache_bytes=None, version=None, storage_options=None, storage_options_provider=None, keep_original_array=None, max_page_bytes=None))] #[allow(clippy::too_many_arguments)] pub fn new( path: String, @@ -308,7 +306,6 @@ impl LanceFileWriter { version: Option, storage_options: Option>, storage_options_provider: Option<&Bound<'_, PyAny>>, - s3_credentials_refresh_offset_seconds: Option, keep_original_array: Option, max_page_bytes: Option, ) -> PyResult { @@ -326,7 +323,6 @@ impl LanceFileWriter { version, storage_options, provider, - s3_credentials_refresh_offset_seconds, keep_original_array, max_page_bytes, ), @@ -383,25 +379,33 @@ pub async fn object_store_from_uri_or_path( uri_or_path: impl AsRef, storage_options: Option>, ) -> PyResult<(Arc, Path)> { - object_store_from_uri_or_path_with_provider(uri_or_path, storage_options, None, None).await + object_store_from_uri_or_path_with_provider(uri_or_path, storage_options, None).await } pub async fn object_store_from_uri_or_path_with_provider( uri_or_path: impl AsRef, storage_options: Option>, storage_options_provider: Option>, - s3_credentials_refresh_offset_seconds: Option, ) -> PyResult<(Arc, Path)> { let object_store_registry = Arc::new(lance::io::ObjectStoreRegistry::default()); - let mut object_store_params = ObjectStoreParams { - storage_options: storage_options.clone(), - storage_options_provider, + + let accessor = match (storage_options, storage_options_provider) { + (Some(opts), Some(provider)) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_initial_and_provider(opts, provider), + )), + (None, Some(provider)) => Some(Arc::new(lance::io::StorageOptionsAccessor::with_provider( + provider, + ))), + (Some(opts), None) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(opts), + )), + (None, None) => None, + }; + + let object_store_params = ObjectStoreParams { + storage_options_accessor: accessor, ..Default::default() }; - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - object_store_params.s3_credentials_refresh_offset = - std::time::Duration::from_secs(offset_seconds); - } let (object_store, path) = ObjectStore::from_uri_and_params( object_store_registry, @@ -425,13 +429,11 @@ impl LanceFileSession { uri_or_path: String, storage_options: Option>, storage_options_provider: Option>, - s3_credentials_refresh_offset_seconds: Option, ) -> PyResult { let (object_store, base_path) = object_store_from_uri_or_path_with_provider( uri_or_path, storage_options, storage_options_provider, - s3_credentials_refresh_offset_seconds, ) .await?; Ok(Self { @@ -444,25 +446,16 @@ impl LanceFileSession { #[pymethods] impl LanceFileSession { #[new] - #[pyo3(signature=(uri_or_path, storage_options=None, storage_options_provider=None, s3_credentials_refresh_offset_seconds=None))] + #[pyo3(signature=(uri_or_path, storage_options=None, storage_options_provider=None))] pub fn new( uri_or_path: String, storage_options: Option>, storage_options_provider: Option<&Bound<'_, PyAny>>, - s3_credentials_refresh_offset_seconds: Option, ) -> PyResult { let provider = storage_options_provider .map(crate::storage_options::py_object_to_storage_options_provider) .transpose()?; - rt().block_on( - None, - Self::try_new( - uri_or_path, - storage_options, - provider, - s3_credentials_refresh_offset_seconds, - ), - )? + rt().block_on(None, Self::try_new(uri_or_path, storage_options, provider))? } #[pyo3(signature=(path, columns=None))] @@ -644,14 +637,12 @@ impl LanceFileReader { uri_or_path: String, storage_options: Option>, storage_options_provider: Option>, - s3_credentials_refresh_offset_seconds: Option, columns: Option>, ) -> PyResult { let (object_store, path) = object_store_from_uri_or_path_with_provider( uri_or_path, storage_options, storage_options_provider, - s3_credentials_refresh_offset_seconds, ) .await?; Self::open_with_store(object_store, path, columns).await @@ -749,27 +740,17 @@ impl LanceFileReader { #[pymethods] impl LanceFileReader { #[new] - #[pyo3(signature=(path, storage_options=None, storage_options_provider=None, s3_credentials_refresh_offset_seconds=None, columns=None))] + #[pyo3(signature=(path, storage_options=None, storage_options_provider=None, columns=None))] pub fn new( path: String, storage_options: Option>, storage_options_provider: Option<&Bound<'_, PyAny>>, - s3_credentials_refresh_offset_seconds: Option, columns: Option>, ) -> PyResult { let provider = storage_options_provider .map(crate::storage_options::py_object_to_storage_options_provider) .transpose()?; - rt().block_on( - None, - Self::open( - path, - storage_options, - provider, - s3_credentials_refresh_offset_seconds, - columns, - ), - )? + rt().block_on(None, Self::open(path, storage_options, provider, columns))? } pub fn read_all( diff --git a/python/src/lib.rs b/python/src/lib.rs index 6448ff9afa3..b7f449289bb 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -281,6 +281,7 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; #[cfg(feature = "rest-adapter")] m.add_class::()?; + m.add_class::()?; m.add_wrapped(wrap_pyfunction!(bfloat16_array))?; m.add_wrapped(wrap_pyfunction!(write_dataset))?; m.add_wrapped(wrap_pyfunction!(write_fragments))?; diff --git a/python/src/storage_options.rs b/python/src/storage_options.rs index 7501fc4a090..7e2c6011ece 100644 --- a/python/src/storage_options.rs +++ b/python/src/storage_options.rs @@ -5,7 +5,7 @@ use std::collections::HashMap; use std::sync::Arc; use async_trait::async_trait; -use lance_io::object_store::StorageOptionsProvider; +use lance_io::object_store::{StorageOptionsAccessor, StorageOptionsProvider}; use pyo3::prelude::*; use pyo3::types::PyDict; @@ -167,3 +167,127 @@ pub fn py_object_to_storage_options_provider( let py_provider = PyStorageOptionsProvider::new(py_obj)?; Ok(Arc::new(PyStorageOptionsProviderWrapper::new(py_provider))) } + +/// Python wrapper for StorageOptionsAccessor +/// +/// This wraps a Rust StorageOptionsAccessor and exposes it to Python. +#[pyclass(name = "StorageOptionsAccessor")] +#[derive(Clone)] +pub struct PyStorageOptionsAccessor { + inner: Arc, +} + +impl PyStorageOptionsAccessor { + pub fn new(accessor: Arc) -> Self { + Self { inner: accessor } + } + + pub fn inner(&self) -> Arc { + self.inner.clone() + } +} + +#[pymethods] +impl PyStorageOptionsAccessor { + /// Create an accessor with only static options (no refresh capability) + #[staticmethod] + fn with_static_options(options: HashMap) -> Self { + Self { + inner: Arc::new(StorageOptionsAccessor::with_static_options(options)), + } + } + + /// Create an accessor with a dynamic provider (no initial options) + /// + /// The refresh offset is extracted from storage options using the `refresh_offset_millis` key. + #[staticmethod] + fn with_provider(provider: &Bound<'_, PyAny>) -> PyResult { + let rust_provider = py_object_to_storage_options_provider(provider)?; + Ok(Self { + inner: Arc::new(StorageOptionsAccessor::with_provider(rust_provider)), + }) + } + + /// Create an accessor with initial options and a dynamic provider + /// + /// The refresh offset is extracted from initial_options using the `refresh_offset_millis` key. + #[staticmethod] + fn with_initial_and_provider( + initial_options: HashMap, + provider: &Bound<'_, PyAny>, + ) -> PyResult { + let rust_provider = py_object_to_storage_options_provider(provider)?; + Ok(Self { + inner: Arc::new(StorageOptionsAccessor::with_initial_and_provider( + initial_options, + rust_provider, + )), + }) + } + + /// Get current valid storage options + fn get_storage_options(&self, py: Python<'_>) -> PyResult> { + let accessor = self.inner.clone(); + let options = rt() + .block_on(Some(py), accessor.get_storage_options())? + .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e.to_string()))?; + Ok(options.0) + } + + /// Get the initial storage options without refresh + fn initial_storage_options(&self) -> Option> { + self.inner.initial_storage_options().cloned() + } + + /// Get the accessor ID for equality/hashing + fn accessor_id(&self) -> String { + self.inner.accessor_id() + } + + /// Check if this accessor has a dynamic provider + fn has_provider(&self) -> bool { + self.inner.has_provider() + } + + /// Get the refresh offset in seconds + fn refresh_offset_secs(&self) -> u64 { + self.inner.refresh_offset().as_secs() + } + + fn __repr__(&self) -> String { + format!( + "StorageOptionsAccessor(id={}, has_provider={})", + self.inner.accessor_id(), + self.inner.has_provider() + ) + } +} + +/// Create a StorageOptionsAccessor from Python parameters +/// +/// This handles the conversion from Python types to Rust StorageOptionsAccessor. +/// The refresh offset is extracted from storage_options using the `refresh_offset_millis` key. +#[allow(dead_code)] +pub fn create_accessor_from_python( + storage_options: Option>, + storage_options_provider: Option<&Bound<'_, PyAny>>, +) -> PyResult>> { + match (storage_options, storage_options_provider) { + (Some(opts), Some(provider)) => { + let rust_provider = py_object_to_storage_options_provider(provider)?; + Ok(Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(opts, rust_provider), + ))) + } + (None, Some(provider)) => { + let rust_provider = py_object_to_storage_options_provider(provider)?; + Ok(Some(Arc::new(StorageOptionsAccessor::with_provider( + rust_provider, + )))) + } + (Some(opts), None) => Ok(Some(Arc::new(StorageOptionsAccessor::with_static_options( + opts, + )))), + (None, None) => Ok(None), + } +} diff --git a/rust/lance-io/src/object_store.rs b/rust/lance-io/src/object_store.rs index d9d700bf878..44324fa3ee3 100644 --- a/rust/lance-io/src/object_store.rs +++ b/rust/lance-io/src/object_store.rs @@ -64,7 +64,8 @@ pub const DEFAULT_DOWNLOAD_RETRY_COUNT: usize = 3; pub use providers::{ObjectStoreProvider, ObjectStoreRegistry}; pub use storage_options::{ - LanceNamespaceStorageOptionsProvider, StorageOptionsProvider, EXPIRES_AT_MILLIS_KEY, + LanceNamespaceStorageOptionsProvider, StorageOptionsAccessor, StorageOptionsProvider, + EXPIRES_AT_MILLIS_KEY, REFRESH_OFFSET_MILLIS_KEY, }; #[async_trait] @@ -187,13 +188,18 @@ pub struct ObjectStoreParams { pub block_size: Option, #[deprecated(note = "Implement an ObjectStoreProvider instead")] pub object_store: Option<(Arc, Url)>, + /// Refresh offset for AWS credentials when using the legacy AWS credentials path. + /// For StorageOptionsAccessor, use `refresh_offset_millis` storage option instead. pub s3_credentials_refresh_offset: Duration, #[cfg(feature = "aws")] pub aws_credentials: Option, pub object_store_wrapper: Option>, - pub storage_options: Option>, - /// Dynamic storage options provider for automatic credential refresh - pub storage_options_provider: Option>, + /// Unified storage options accessor with caching and automatic refresh + /// + /// Provides storage options and optionally a dynamic provider for automatic + /// credential refresh. Use `StorageOptionsAccessor::with_static_options()` for static + /// options or `StorageOptionsAccessor::with_initial_and_provider()` for dynamic refresh. + pub storage_options_accessor: Option>, /// Use constant size upload parts for multipart uploads. Only necessary /// for Cloudflare R2, which doesn't support variable size parts. When this /// is false, max upload size is 2.5TB. When this is true, the max size is @@ -212,19 +218,34 @@ impl Default for ObjectStoreParams { #[cfg(feature = "aws")] aws_credentials: None, object_store_wrapper: None, - storage_options: None, - storage_options_provider: None, + storage_options_accessor: None, use_constant_size_upload_parts: false, list_is_lexically_ordered: None, } } } +impl ObjectStoreParams { + /// Get the StorageOptionsAccessor from the params + pub fn get_accessor(&self) -> Option> { + self.storage_options_accessor.clone() + } + + /// Get storage options from the accessor, if any + /// + /// Returns the initial storage options from the accessor without triggering refresh. + pub fn storage_options(&self) -> Option<&HashMap> { + self.storage_options_accessor + .as_ref() + .and_then(|a| a.initial_storage_options()) + } +} + // We implement hash for caching impl std::hash::Hash for ObjectStoreParams { #[allow(deprecated)] fn hash(&self, state: &mut H) { - // For hashing, we use pointer values for ObjectStore, S3 credentials, wrapper, and storage options provider + // For hashing, we use pointer values for ObjectStore, S3 credentials, wrapper self.block_size.hash(state); if let Some((store, url)) = &self.object_store { Arc::as_ptr(store).hash(state); @@ -238,14 +259,8 @@ impl std::hash::Hash for ObjectStoreParams { if let Some(wrapper) = &self.object_store_wrapper { Arc::as_ptr(wrapper).hash(state); } - if let Some(storage_options) = &self.storage_options { - for (key, value) in storage_options { - key.hash(state); - value.hash(state); - } - } - if let Some(provider) = &self.storage_options_provider { - provider.provider_id().hash(state); + if let Some(accessor) = &self.storage_options_accessor { + accessor.accessor_id().hash(state); } self.use_constant_size_upload_parts.hash(state); self.list_is_lexically_ordered.hash(state); @@ -263,7 +278,7 @@ impl PartialEq for ObjectStoreParams { } // For equality, we use pointer comparison for ObjectStore, S3 credentials, wrapper - // For storage_options_provider, we use provider_id() for semantic equality + // For accessor, we use accessor_id() for semantic equality self.block_size == other.block_size && self .object_store @@ -276,15 +291,14 @@ impl PartialEq for ObjectStoreParams { && self.s3_credentials_refresh_offset == other.s3_credentials_refresh_offset && self.object_store_wrapper.as_ref().map(Arc::as_ptr) == other.object_store_wrapper.as_ref().map(Arc::as_ptr) - && self.storage_options == other.storage_options && self - .storage_options_provider + .storage_options_accessor .as_ref() - .map(|p| p.provider_id()) + .map(|a| a.accessor_id()) == other - .storage_options_provider + .storage_options_accessor .as_ref() - .map(|p| p.provider_id()) + .map(|a| a.accessor_id()) && self.use_constant_size_upload_parts == other.use_constant_size_upload_parts && self.list_is_lexically_ordered == other.list_is_lexically_ordered } @@ -414,7 +428,7 @@ impl ObjectStore { if let Some((store, path)) = params.object_store.as_ref() { let mut inner = store.clone(); let store_prefix = - registry.calculate_object_store_prefix(uri, params.storage_options.as_ref())?; + registry.calculate_object_store_prefix(uri, params.storage_options())?; if let Some(wrapper) = params.object_store_wrapper.as_ref() { inner = wrapper.wrap(&store_prefix, inner); } @@ -984,8 +998,11 @@ mod tests { ) { // Test the default let registry = Arc::new(ObjectStoreRegistry::default()); + let accessor = storage_options + .clone() + .map(|opts| Arc::new(StorageOptionsAccessor::with_static_options(opts))); let params = ObjectStoreParams { - storage_options: storage_options.clone(), + storage_options_accessor: accessor.clone(), ..ObjectStoreParams::default() }; let (store, _) = ObjectStore::from_uri_and_params(registry, uri, ¶ms) @@ -997,7 +1014,7 @@ mod tests { let registry = Arc::new(ObjectStoreRegistry::default()); let params = ObjectStoreParams { block_size: Some(1024), - storage_options: storage_options.clone(), + storage_options_accessor: accessor, ..ObjectStoreParams::default() }; let (store, _) = ObjectStore::from_uri_and_params(registry, uri, ¶ms) diff --git a/rust/lance-io/src/object_store/providers.rs b/rust/lance-io/src/object_store/providers.rs index f2aef00583b..b48e6f20079 100644 --- a/rust/lance-io/src/object_store/providers.rs +++ b/rust/lance-io/src/object_store/providers.rs @@ -209,7 +209,7 @@ impl ObjectStoreRegistry { }; let cache_path = - provider.calculate_object_store_prefix(&base_path, params.storage_options.as_ref())?; + provider.calculate_object_store_prefix(&base_path, params.storage_options())?; let cache_key = (cache_path.clone(), params.clone()); // Check if we have a cached store for this base path and params @@ -417,12 +417,15 @@ mod tests { #[tokio::test] async fn test_stats_hit_miss_tracking() { + use crate::object_store::StorageOptionsAccessor; let registry = ObjectStoreRegistry::default(); let url = Url::parse("memory://test").unwrap(); let params1 = ObjectStoreParams::default(); let params2 = ObjectStoreParams { - storage_options: Some(HashMap::from([("k".into(), "v".into())])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( + HashMap::from([("k".into(), "v".into())]), + ))), ..Default::default() }; diff --git a/rust/lance-io/src/object_store/providers/aws.rs b/rust/lance-io/src/object_store/providers/aws.rs index 5def1b52923..982470581f2 100644 --- a/rust/lance-io/src/object_store/providers/aws.rs +++ b/rust/lance-io/src/object_store/providers/aws.rs @@ -28,8 +28,9 @@ use tokio::sync::RwLock; use url::Url; use crate::object_store::{ - ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptions, StorageOptionsProvider, - DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, + ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptions, StorageOptionsAccessor, + StorageOptionsProvider, DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, + DEFAULT_MAX_IOP_SIZE, }; use lance_core::error::{Error, Result}; @@ -54,13 +55,16 @@ impl AwsStoreProvider { let mut s3_storage_options = storage_options.as_s3_options(); let region = resolve_s3_region(base_path, &s3_storage_options).await?; + + // Get accessor from params + let accessor = params.get_accessor(); + let (aws_creds, region) = build_aws_credential( params.s3_credentials_refresh_offset, params.aws_credentials.clone(), Some(&s3_storage_options), region, - params.storage_options_provider.clone(), - storage_options.expires_at_millis(), + accessor, ) .await?; @@ -132,7 +136,7 @@ impl ObjectStoreProvider for AwsStoreProvider { ) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); let mut storage_options = - StorageOptions(params.storage_options.clone().unwrap_or_default()); + StorageOptions(params.storage_options().cloned().unwrap_or_default()); storage_options.with_env_s3(); let download_retry_count = storage_options.download_retry_count(); @@ -172,7 +176,7 @@ impl ObjectStoreProvider for AwsStoreProvider { download_retry_count, io_tracker: Default::default(), store_prefix: self - .calculate_object_store_prefix(&base_path, params.storage_options.as_ref())?, + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } } @@ -228,20 +232,17 @@ async fn resolve_s3_region( /// Build AWS credentials /// /// This resolves credentials from the following sources in order: -/// 1. An explicit `storage_options_provider` +/// 1. An explicit `storage_options_accessor` with a provider /// 2. An explicit `credentials` provider /// 3. Explicit credentials in storage_options (as in `aws_access_key_id`, /// `aws_secret_access_key`, `aws_session_token`) /// 4. The default credential provider chain from AWS SDK. /// -/// # Initial Credentials with Storage Options Provider +/// # Storage Options Accessor /// -/// When `storage_options_provider` is provided along with `storage_options` and -/// `expires_at_millis`, these serve as **initial values** to avoid redundant calls to -/// fetch new storage options. The provider will use these initial credentials until they -/// expire (based on `expires_at_millis`), then automatically fetch fresh credentials from -/// the provider. Once the initial credentials expire, the passed-in values are no longer -/// used - all future credentials come from the provider's `fetch_storage_options()` method. +/// When `storage_options_accessor` is provided and has a dynamic provider, +/// credentials are fetched and cached by the accessor with automatic refresh +/// before expiration. /// /// `credentials_refresh_offset` is the amount of time before expiry to refresh credentials. pub async fn build_aws_credential( @@ -249,10 +250,8 @@ pub async fn build_aws_credential( credentials: Option, storage_options: Option<&HashMap>, region: Option, - storage_options_provider: Option>, - expires_at_millis: Option, + storage_options_accessor: Option>, ) -> Result<(AwsCredentialProvider, String)> { - // TODO: make this return no credential provider not using AWS use aws_config::meta::region::RegionProviderChain; const DEFAULT_REGION: &str = "us-west-2"; @@ -268,17 +267,24 @@ pub async fn build_aws_credential( }; let storage_options_credentials = storage_options.and_then(extract_static_s3_credentials); - if let Some(storage_options_provider) = storage_options_provider { - let creds = build_aws_credential_with_storage_options_provider( - storage_options_provider, - credentials_refresh_offset, - credentials, - storage_options_credentials, - expires_at_millis, - ) - .await?; - Ok((creds, region)) - } else if let Some(creds) = credentials { + + // If accessor has a provider, use DynamicStorageOptionsCredentialProvider + if let Some(accessor) = storage_options_accessor { + if accessor.has_provider() { + // Explicit aws_credentials takes precedence + if let Some(creds) = credentials { + return Ok((creds, region)); + } + // Use accessor for dynamic credential refresh + return Ok(( + Arc::new(DynamicStorageOptionsCredentialProvider::new(accessor)), + region, + )); + } + } + + // Fall back to existing logic for static credentials + if let Some(creds) = credentials { Ok((creds, region)) } else if let Some(creds) = storage_options_credentials { Ok((Arc::new(creds), region)) @@ -295,37 +301,6 @@ pub async fn build_aws_credential( } } -async fn build_aws_credential_with_storage_options_provider( - storage_options_provider: Arc, - credentials_refresh_offset: Duration, - credentials: Option, - storage_options_credentials: Option>, - expires_at_millis: Option, -) -> Result { - match (credentials, storage_options_credentials) { - // Case 1: Explicit aws_credentials provider takes precedence - use it directly - // without wrapping in DynamicStorageOptionsCredentialProvider, as the user's - // provider should handle its own credential refresh logic. - (Some(cred), _) => Ok(cred), - // Case 2: storage_options credentials - wrap with DynamicStorageOptionsCredentialProvider - // to enable auto-refresh from the namespace - (None, Some(cred)) => Ok(Arc::new( - DynamicStorageOptionsCredentialProvider::new_with_initial_credential( - storage_options_provider, - credentials_refresh_offset, - cred.get_credential().await?, - expires_at_millis, - ), - )), - // Case 3: No initial credentials - DynamicStorageOptionsCredentialProvider will - // fetch credentials from the namespace on first use - (None, None) => Ok(Arc::new(DynamicStorageOptionsCredentialProvider::new( - storage_options_provider, - credentials_refresh_offset, - ))), - } -} - fn extract_static_s3_credentials( options: &HashMap, ) -> Option> { @@ -468,20 +443,24 @@ impl ObjectStoreParams { aws_credentials: Option, region: Option, ) -> Self { + let storage_options_accessor = region.map(|region| { + let opts: HashMap = + [("region".into(), region)].iter().cloned().collect(); + Arc::new(StorageOptionsAccessor::with_static_options(opts)) + }); Self { aws_credentials, - storage_options: region - .map(|region| [("region".into(), region)].iter().cloned().collect()), + storage_options_accessor, ..Default::default() } } } -/// AWS Credential Provider that uses StorageOptionsProvider +/// AWS Credential Provider that delegates to StorageOptionsAccessor /// -/// This adapter converts our generic StorageOptionsProvider trait into -/// AWS-specific credentials that can be used with S3. It caches credentials -/// and automatically refreshes them before they expire. +/// This adapter converts storage options from a [`StorageOptionsAccessor`] into +/// AWS-specific credentials that can be used with S3. All caching and refresh logic +/// is handled by the accessor. /// /// # Future Work /// @@ -491,129 +470,71 @@ impl ObjectStoreParams { /// /// See: pub struct DynamicStorageOptionsCredentialProvider { - provider: Arc, - cache: Arc>>, - refresh_offset: Duration, + accessor: Arc, } impl fmt::Debug for DynamicStorageOptionsCredentialProvider { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("DynamicStorageOptionsCredentialProvider") - .field("provider", &self.provider) - .field("refresh_offset", &self.refresh_offset) + .field("accessor", &self.accessor) .finish() } } -#[derive(Debug, Clone)] -struct CachedCredential { - credential: Arc, - expires_at_millis: Option, -} - impl DynamicStorageOptionsCredentialProvider { - /// Create a new credential provider without initial credentials + /// Create a new credential provider from a storage options accessor + pub fn new(accessor: Arc) -> Self { + Self { accessor } + } + + /// Create a new credential provider from a storage options provider + /// + /// This is a convenience constructor for backward compatibility. + /// The refresh offset will be extracted from storage options using + /// the `refresh_offset_millis` key, defaulting to 60 seconds. /// /// # Arguments /// * `provider` - The storage options provider - /// * `refresh_offset` - Duration before expiry to refresh credentials - pub fn new(provider: Arc, refresh_offset: Duration) -> Self { + pub fn from_provider(provider: Arc) -> Self { Self { - provider, - cache: Arc::new(RwLock::new(None)), - refresh_offset, + accessor: Arc::new(StorageOptionsAccessor::with_provider(provider)), } } - /// Create a new credential provider with initial credentials from an explicit credential + /// Create a new credential provider with initial credentials + /// + /// This is a convenience constructor for backward compatibility. + /// The refresh offset will be extracted from initial_options using + /// the `refresh_offset_millis` key, defaulting to 60 seconds. /// /// # Arguments /// * `provider` - The storage options provider - /// * `refresh_offset` - Duration before expiry to refresh credentials - /// * `credential` - Initial credential to cache - /// * `expires_at_millis` - Expiration time in milliseconds since epoch. If None, credentials - /// are treated as non-expiring and will not be automatically refreshed. - pub fn new_with_initial_credential( + /// * `initial_options` - Initial storage options to cache + pub fn from_provider_with_initial( provider: Arc, - refresh_offset: Duration, - credential: Arc, - expires_at_millis: Option, + initial_options: HashMap, ) -> Self { Self { - provider, - cache: Arc::new(RwLock::new(Some(CachedCredential { - credential, - expires_at_millis, - }))), - refresh_offset, - } - } - - fn needs_refresh(&self, cached: &Option) -> bool { - match cached { - None => true, - Some(cached_cred) => { - if let Some(expires_at_millis) = cached_cred.expires_at_millis { - let now_ms = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap_or(Duration::from_secs(0)) - .as_millis() as u64; - - // Refresh if we're within the refresh offset of expiration - let refresh_offset_millis = self.refresh_offset.as_millis() as u64; - now_ms + refresh_offset_millis >= expires_at_millis - } else { - // No expiration means credentials never expire - false - } - } + accessor: Arc::new(StorageOptionsAccessor::with_initial_and_provider( + initial_options, + provider, + )), } } +} - async fn do_get_credential(&self) -> ObjectStoreResult>> { - // Check if we have valid cached credentials with read lock - { - let cached = self.cache.read().await; - if !self.needs_refresh(&cached) { - if let Some(cached_cred) = &*cached { - return Ok(Some(cached_cred.credential.clone())); - } - } - } - - // Try to acquire write lock - if it fails, return None and let caller retry - let Ok(mut cache) = self.cache.try_write() else { - return Ok(None); - }; - - // Double-check if credentials are still stale after acquiring write lock - // (another thread might have refreshed them) - if !self.needs_refresh(&cache) { - if let Some(cached_cred) = &*cache { - return Ok(Some(cached_cred.credential.clone())); - } - } - - log::debug!( - "Refreshing S3 credentials from storage options provider: {}", - self.provider.provider_id() - ); +#[async_trait::async_trait] +impl CredentialProvider for DynamicStorageOptionsCredentialProvider { + type Credential = ObjectStoreAwsCredential; - let storage_options_map = self - .provider - .fetch_storage_options() - .await - .map_err(|e| object_store::Error::Generic { + async fn get_credential(&self) -> ObjectStoreResult> { + let storage_options = self.accessor.get_storage_options().await.map_err(|e| { + object_store::Error::Generic { store: "DynamicStorageOptionsCredentialProvider", source: Box::new(e), - })? - .ok_or_else(|| object_store::Error::Generic { - store: "DynamicStorageOptionsCredentialProvider", - source: "No storage options available".into(), - })?; + } + })?; - let storage_options = StorageOptions(storage_options_map); - let expires_at_millis = storage_options.expires_at_millis(); let s3_options = storage_options.as_s3_options(); let static_creds = extract_static_s3_credentials(&s3_options).ok_or_else(|| { object_store::Error::Generic { @@ -622,58 +543,13 @@ impl DynamicStorageOptionsCredentialProvider { } })?; - let credential = - static_creds - .get_credential() - .await - .map_err(|e| object_store::Error::Generic { - store: "DynamicStorageOptionsCredentialProvider", - source: Box::new(e), - })?; - - if let Some(expires_at) = expires_at_millis { - let now_ms = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap_or(Duration::from_secs(0)) - .as_millis() as u64; - let expires_in_secs = (expires_at.saturating_sub(now_ms)) / 1000; - log::debug!( - "Successfully refreshed S3 credentials from provider: {}, credentials expire in {} seconds", - self.provider.provider_id(), - expires_in_secs - ); - } else { - log::debug!( - "Successfully refreshed S3 credentials from provider: {} (no expiration)", - self.provider.provider_id() - ); - } - - *cache = Some(CachedCredential { - credential: credential.clone(), - expires_at_millis, - }); - - Ok(Some(credential)) - } -} - -#[async_trait::async_trait] -impl CredentialProvider for DynamicStorageOptionsCredentialProvider { - type Credential = ObjectStoreAwsCredential; - - async fn get_credential(&self) -> ObjectStoreResult> { - // Retry loop - if do_get_credential returns None (lock busy), retry from the beginning - loop { - match self.do_get_credential().await? { - Some(cred) => return Ok(cred), - None => { - // Lock was busy, wait 10ms before retrying - tokio::time::sleep(Duration::from_millis(10)).await; - continue; - } - } - } + static_creds + .get_credential() + .await + .map_err(|e| object_store::Error::Generic { + store: "DynamicStorageOptionsCredentialProvider", + source: Box::new(e), + }) } } @@ -795,13 +671,16 @@ mod tests { #[tokio::test] async fn test_use_opendal_flag() { + use crate::object_store::StorageOptionsAccessor; let provider = AwsStoreProvider; let url = Url::parse("s3://test-bucket/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options: Some(HashMap::from([ - ("use_opendal".to_string(), "true".to_string()), - ("region".to_string(), "us-west-2".to_string()), - ])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( + HashMap::from([ + ("use_opendal".to_string(), "true".to_string()), + ("region".to_string(), "us-west-2".to_string()), + ]), + ))), ..Default::default() }; @@ -878,19 +757,22 @@ mod tests { 600_000, // Expires in 10 minutes ))); - // Create credential provider with initial cached credentials that expire in 10 minutes + // Create initial options with cached credentials that expire in 10 minutes let expires_at = now_ms + 600_000; // 10 minutes from now - let initial_cred = Arc::new(ObjectStoreAwsCredential { - key_id: "AKID_CACHED".to_string(), - secret_key: "SECRET_CACHED".to_string(), - token: Some("TOKEN_CACHED".to_string()), - }); + let initial_options = HashMap::from([ + ("aws_access_key_id".to_string(), "AKID_CACHED".to_string()), + ( + "aws_secret_access_key".to_string(), + "SECRET_CACHED".to_string(), + ), + ("aws_session_token".to_string(), "TOKEN_CACHED".to_string()), + ("expires_at_millis".to_string(), expires_at.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset + ]); - let provider = DynamicStorageOptionsCredentialProvider::new_with_initial_credential( + let provider = DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset - initial_cred, - Some(expires_at), + initial_options, ); // First call should use cached credentials (not expired yet) @@ -914,19 +796,21 @@ mod tests { 600_000, // Expires in 10 minutes ))); - // Create credential provider with initial cached credentials that expired 1 second ago + // Create initial options with credentials that expired 1 second ago let expired_time = now_ms - 1_000; // 1 second ago - let initial_cred = Arc::new(ObjectStoreAwsCredential { - key_id: "AKID_EXPIRED".to_string(), - secret_key: "SECRET_EXPIRED".to_string(), - token: None, - }); + let initial_options = HashMap::from([ + ("aws_access_key_id".to_string(), "AKID_EXPIRED".to_string()), + ( + "aws_secret_access_key".to_string(), + "SECRET_EXPIRED".to_string(), + ), + ("expires_at_millis".to_string(), expired_time.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset + ]); - let provider = DynamicStorageOptionsCredentialProvider::new_with_initial_credential( + let provider = DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset - initial_cred, - Some(expired_time), + initial_options, ); // First call should fetch new credentials because cached ones are expired @@ -943,27 +827,24 @@ mod tests { async fn test_dynamic_credential_provider_refresh_lead_time() { MockClock::set_system_time(Duration::from_secs(100_000)); - // Create a mock provider that returns credentials expiring in 4 minutes + // Create a mock provider that returns credentials expiring in 30 seconds let mock = Arc::new(MockStorageOptionsProvider::new(Some( - 240_000, // Expires in 4 minutes + 30_000, // Expires in 30 seconds ))); - // Create credential provider with 5 minute refresh offset - // This means credentials should be refreshed when they have less than 5 minutes left - let provider = DynamicStorageOptionsCredentialProvider::new( - mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset - ); + // Create credential provider with default 60 second refresh offset + // This means credentials should be refreshed when they have less than 60 seconds left + let provider = DynamicStorageOptionsCredentialProvider::from_provider(mock.clone()); // First call should fetch credentials from provider (no initial cache) - // Credentials expire in 4 minutes, which is less than our 5 minute refresh offset, + // Credentials expire in 30 seconds, which is less than our 60 second refresh offset, // so they should be considered "needs refresh" immediately let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_1"); assert_eq!(mock.get_call_count().await, 1); - // Second call should trigger refresh because credentials expire in 4 minutes - // but our refresh lead time is 5 minutes (now + 5min > expires_at) + // Second call should trigger refresh because credentials expire in 30 seconds + // but our refresh lead time is 60 seconds (now + 60sec > expires_at) // The mock will return new credentials (AKID_2) with the same expiration let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_2"); @@ -974,16 +855,13 @@ mod tests { async fn test_dynamic_credential_provider_no_initial_cache() { MockClock::set_system_time(Duration::from_secs(100_000)); - // Create a mock provider that returns credentials expiring in 10 minutes + // Create a mock provider that returns credentials expiring in 2 minutes let mock = Arc::new(MockStorageOptionsProvider::new(Some( - 600_000, // Expires in 10 minutes + 120_000, // Expires in 2 minutes ))); - // Create credential provider without initial cache - let provider = DynamicStorageOptionsCredentialProvider::new( - mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset - ); + // Create credential provider without initial cache, using default 60 second refresh offset + let provider = DynamicStorageOptionsCredentialProvider::from_provider(mock.clone()); // First call should fetch from provider (call count = 1) let cred = provider.get_credential().await.unwrap(); @@ -992,21 +870,22 @@ mod tests { assert_eq!(cred.token, Some("TOKEN_1".to_string())); assert_eq!(mock.get_call_count().await, 1); - // Second call should use cached credentials (not expired yet) + // Second call should use cached credentials (not expired yet, still > 60 seconds remaining) let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_1"); assert_eq!(mock.get_call_count().await, 1); // Still 1, didn't fetch again - // Advance time to 6 minutes - should trigger refresh (within 5 min refresh offset) - MockClock::set_system_time(Duration::from_secs(100_000 + 360)); + // Advance time to 90 seconds - should trigger refresh (within 60 sec refresh offset) + // At this point, credentials expire in 30 seconds (< 60 sec offset) + MockClock::set_system_time(Duration::from_secs(100_000 + 90)); let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_2"); assert_eq!(cred.secret_key, "SECRET_2"); assert_eq!(cred.token, Some("TOKEN_2".to_string())); assert_eq!(mock.get_call_count().await, 2); - // Advance time to 11 minutes total - should trigger another refresh - MockClock::set_system_time(Duration::from_secs(100_000 + 660)); + // Advance time to 210 seconds total (90 + 120) - should trigger another refresh + MockClock::set_system_time(Duration::from_secs(100_000 + 210)); let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_3"); assert_eq!(cred.secret_key, "SECRET_3"); @@ -1014,7 +893,7 @@ mod tests { } #[tokio::test] - async fn test_dynamic_credential_provider_with_initial_credential() { + async fn test_dynamic_credential_provider_with_initial_options() { MockClock::set_system_time(Duration::from_secs(100_000)); let now_ms = MockClock::system_time().as_millis() as u64; @@ -1024,20 +903,23 @@ mod tests { 600_000, // Expires in 10 minutes ))); - // Create an initial credential with expiration in 10 minutes + // Create initial options with expiration in 10 minutes let expires_at = now_ms + 600_000; // 10 minutes from now - let initial_cred = Arc::new(ObjectStoreAwsCredential { - key_id: "AKID_INITIAL".to_string(), - secret_key: "SECRET_INITIAL".to_string(), - token: Some("TOKEN_INITIAL".to_string()), - }); + let initial_options = HashMap::from([ + ("aws_access_key_id".to_string(), "AKID_INITIAL".to_string()), + ( + "aws_secret_access_key".to_string(), + "SECRET_INITIAL".to_string(), + ), + ("aws_session_token".to_string(), "TOKEN_INITIAL".to_string()), + ("expires_at_millis".to_string(), expires_at.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset + ]); - // Create credential provider with initial credential and expiration - let provider = DynamicStorageOptionsCredentialProvider::new_with_initial_credential( + // Create credential provider with initial options + let provider = DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset - initial_cred, - Some(expires_at), + initial_options, ); // First call should use the initial credential (not expired yet) @@ -1086,9 +968,8 @@ mod tests { // Create a mock provider with far future expiration let mock = Arc::new(MockStorageOptionsProvider::new(Some(9999999999999))); - let provider = Arc::new(DynamicStorageOptionsCredentialProvider::new( + let provider = Arc::new(DynamicStorageOptionsCredentialProvider::from_provider( mock.clone(), - Duration::from_secs(300), )); // Spawn 10 concurrent tasks that all try to get credentials at the same time @@ -1134,14 +1015,18 @@ mod tests { let now_ms = MockClock::system_time().as_millis() as u64; - // Create initial credentials that expired in the past (1000 seconds ago) + // Create initial options with credentials that expired in the past (1000 seconds ago) let expires_at = now_ms - 1_000_000; - - let initial_cred = Arc::new(ObjectStoreAwsCredential { - key_id: "AKID_OLD".to_string(), - secret_key: "SECRET_OLD".to_string(), - token: Some("TOKEN_OLD".to_string()), - }); + let initial_options = HashMap::from([ + ("aws_access_key_id".to_string(), "AKID_OLD".to_string()), + ( + "aws_secret_access_key".to_string(), + "SECRET_OLD".to_string(), + ), + ("aws_session_token".to_string(), "TOKEN_OLD".to_string()), + ("expires_at_millis".to_string(), expires_at.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset + ]); // Mock will return credentials expiring in 1 hour let mock = Arc::new(MockStorageOptionsProvider::new(Some( @@ -1149,11 +1034,9 @@ mod tests { ))); let provider = Arc::new( - DynamicStorageOptionsCredentialProvider::new_with_initial_credential( + DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), - Duration::from_secs(300), - initial_cred, - Some(expires_at), + initial_options, ), ); @@ -1201,21 +1084,26 @@ mod tests { } #[tokio::test] - async fn test_explicit_aws_credentials_takes_precedence_over_storage_options_provider() { + async fn test_explicit_aws_credentials_takes_precedence_over_accessor() { // Create a mock storage options provider that should NOT be called let mock_storage_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + // Create an accessor with the mock provider + let accessor = Arc::new(StorageOptionsAccessor::with_provider( + mock_storage_provider.clone(), + )); + // Create an explicit AWS credentials provider let explicit_cred_provider = Arc::new(MockAwsCredentialsProvider::default()); - // Build credentials with both aws_credentials AND storage_options_provider + // Build credentials with both aws_credentials AND accessor // The explicit aws_credentials should take precedence - let result = build_aws_credential_with_storage_options_provider( - mock_storage_provider.clone(), + let (result, _region) = build_aws_credential( Duration::from_secs(300), Some(explicit_cred_provider.clone() as AwsCredentialProvider), - None, // no storage_options credentials - Some(1000), + None, // no storage_options + Some("us-west-2".to_string()), + Some(accessor), ) .await .unwrap(); @@ -1239,8 +1127,7 @@ mod tests { } #[tokio::test] - async fn test_storage_options_credentials_uses_dynamic_provider_when_no_explicit_aws_credentials( - ) { + async fn test_accessor_used_when_no_explicit_aws_credentials() { MockClock::set_system_time(Duration::from_secs(100_000)); let now_ms = MockClock::system_time().as_millis() as u64; @@ -1248,30 +1135,46 @@ mod tests { // Create a mock storage options provider let mock_storage_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); - // Create storage_options credentials (simulating credentials from storage_options map) - let storage_options_cred = StaticCredentialProvider::new(ObjectStoreAwsCredential { - key_id: "AKID_FROM_STORAGE_OPTIONS".to_string(), - secret_key: "SECRET_FROM_STORAGE_OPTIONS".to_string(), - token: Some("TOKEN_FROM_STORAGE_OPTIONS".to_string()), - }); - + // Create initial options let expires_at = now_ms + 600_000; // 10 minutes from now + let initial_options = HashMap::from([ + ( + "aws_access_key_id".to_string(), + "AKID_FROM_ACCESSOR".to_string(), + ), + ( + "aws_secret_access_key".to_string(), + "SECRET_FROM_ACCESSOR".to_string(), + ), + ( + "aws_session_token".to_string(), + "TOKEN_FROM_ACCESSOR".to_string(), + ), + ("expires_at_millis".to_string(), expires_at.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset + ]); - // Build credentials with storage_options_credentials but NO explicit aws_credentials - let result = build_aws_credential_with_storage_options_provider( + // Create an accessor with initial options and provider + let accessor = Arc::new(StorageOptionsAccessor::with_initial_and_provider( + initial_options, mock_storage_provider.clone(), + )); + + // Build credentials with accessor but NO explicit aws_credentials + let (result, _region) = build_aws_credential( Duration::from_secs(300), None, // no explicit aws_credentials - Some(storage_options_cred), - Some(expires_at), + None, // no storage_options + Some("us-west-2".to_string()), + Some(accessor), ) .await .unwrap(); - // Get credential - should use the initial storage_options credentials + // Get credential - should use the initial accessor credentials let cred = result.get_credential().await.unwrap(); - assert_eq!(cred.key_id, "AKID_FROM_STORAGE_OPTIONS"); - assert_eq!(cred.secret_key, "SECRET_FROM_STORAGE_OPTIONS"); + assert_eq!(cred.key_id, "AKID_FROM_ACCESSOR"); + assert_eq!(cred.secret_key, "SECRET_FROM_ACCESSOR"); // Storage options provider should NOT have been called yet (using cached initial creds) assert_eq!(mock_storage_provider.get_call_count().await, 0); diff --git a/rust/lance-io/src/object_store/providers/azure.rs b/rust/lance-io/src/object_store/providers/azure.rs index 084e61a72e2..7bf566c8972 100644 --- a/rust/lance-io/src/object_store/providers/azure.rs +++ b/rust/lance-io/src/object_store/providers/azure.rs @@ -95,7 +95,7 @@ impl ObjectStoreProvider for AzureBlobStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); let mut storage_options = - StorageOptions(params.storage_options.clone().unwrap_or_default()); + StorageOptions(params.storage_options().cloned().unwrap_or_default()); storage_options.with_env_azure(); let download_retry_count = storage_options.download_retry_count(); @@ -124,7 +124,7 @@ impl ObjectStoreProvider for AzureBlobStoreProvider { download_retry_count, io_tracker: Default::default(), store_prefix: self - .calculate_object_store_prefix(&base_path, params.storage_options.as_ref())?, + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } @@ -232,21 +232,24 @@ mod tests { #[tokio::test] async fn test_use_opendal_flag() { + use crate::object_store::StorageOptionsAccessor; let provider = AzureBlobStoreProvider; let url = Url::parse("az://test-container/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options: Some(HashMap::from([ - ("use_opendal".to_string(), "true".to_string()), - ("account_name".to_string(), "test_account".to_string()), - ( - "endpoint".to_string(), - "https://test_account.blob.core.windows.net".to_string(), - ), - ( - "account_key".to_string(), - "dGVzdF9hY2NvdW50X2tleQ==".to_string(), - ), - ])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( + HashMap::from([ + ("use_opendal".to_string(), "true".to_string()), + ("account_name".to_string(), "test_account".to_string()), + ( + "endpoint".to_string(), + "https://test_account.blob.core.windows.net".to_string(), + ), + ( + "account_key".to_string(), + "dGVzdF9hY2NvdW50X2tleQ==".to_string(), + ), + ]), + ))), ..Default::default() }; diff --git a/rust/lance-io/src/object_store/providers/gcp.rs b/rust/lance-io/src/object_store/providers/gcp.rs index 282aac13b94..dba5cd8dd40 100644 --- a/rust/lance-io/src/object_store/providers/gcp.rs +++ b/rust/lance-io/src/object_store/providers/gcp.rs @@ -96,7 +96,7 @@ impl ObjectStoreProvider for GcsStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); let mut storage_options = - StorageOptions(params.storage_options.clone().unwrap_or_default()); + StorageOptions(params.storage_options().cloned().unwrap_or_default()); storage_options.with_env_gcs(); let download_retry_count = storage_options.download_retry_count(); @@ -125,7 +125,7 @@ impl ObjectStoreProvider for GcsStoreProvider { download_retry_count, io_tracker: Default::default(), store_prefix: self - .calculate_object_store_prefix(&base_path, params.storage_options.as_ref())?, + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } } @@ -182,16 +182,19 @@ mod tests { #[tokio::test] async fn test_use_opendal_flag() { + use crate::object_store::StorageOptionsAccessor; let provider = GcsStoreProvider; let url = Url::parse("gs://test-bucket/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options: Some(HashMap::from([ - ("use_opendal".to_string(), "true".to_string()), - ( - "service_account".to_string(), - "test@example.iam.gserviceaccount.com".to_string(), - ), - ])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( + HashMap::from([ + ("use_opendal".to_string(), "true".to_string()), + ( + "service_account".to_string(), + "test@example.iam.gserviceaccount.com".to_string(), + ), + ]), + ))), ..Default::default() }; diff --git a/rust/lance-io/src/object_store/providers/huggingface.rs b/rust/lance-io/src/object_store/providers/huggingface.rs index 5bbdf9fcd1c..55c5f6d50b9 100644 --- a/rust/lance-io/src/object_store/providers/huggingface.rs +++ b/rust/lance-io/src/object_store/providers/huggingface.rs @@ -65,7 +65,7 @@ impl ObjectStoreProvider for HuggingfaceStoreProvider { } = parse_hf_url(&base_path)?; let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); let download_retry_count = storage_options.download_retry_count(); // Build OpenDAL config with allowed keys only. @@ -115,7 +115,7 @@ impl ObjectStoreProvider for HuggingfaceStoreProvider { download_retry_count, io_tracker: Default::default(), store_prefix: self - .calculate_object_store_prefix(&base_path, params.storage_options.as_ref())?, + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } @@ -159,12 +159,13 @@ mod tests { #[test] fn storage_option_revision_takes_precedence() { + use crate::object_store::StorageOptionsAccessor; + use std::sync::Arc; let url = Url::parse("hf://datasets/acme/repo/data/file").unwrap(); let params = ObjectStoreParams { - storage_options: Some(HashMap::from([( - String::from("hf_revision"), - String::from("stable"), - )])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( + HashMap::from([(String::from("hf_revision"), String::from("stable"))]), + ))), ..Default::default() }; // new_store should accept without creating operator; test precedence via builder config @@ -177,8 +178,7 @@ mod tests { config_map.insert("repo_type".to_string(), repo_type); config_map.insert("repo".to_string(), repo_id); if let Some(rev) = params - .storage_options - .as_ref() + .storage_options() .unwrap() .get("hf_revision") .cloned() diff --git a/rust/lance-io/src/object_store/providers/local.rs b/rust/lance-io/src/object_store/providers/local.rs index b7820668013..78c8c9632c4 100644 --- a/rust/lance-io/src/object_store/providers/local.rs +++ b/rust/lance-io/src/object_store/providers/local.rs @@ -20,7 +20,7 @@ pub struct FileStoreProvider; impl ObjectStoreProvider for FileStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_LOCAL_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); let download_retry_count = storage_options.download_retry_count(); Ok(ObjectStore { inner: Arc::new(LocalFileSystem::new()), @@ -33,7 +33,7 @@ impl ObjectStoreProvider for FileStoreProvider { download_retry_count, io_tracker: Default::default(), store_prefix: self - .calculate_object_store_prefix(&base_path, params.storage_options.as_ref())?, + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } diff --git a/rust/lance-io/src/object_store/providers/memory.rs b/rust/lance-io/src/object_store/providers/memory.rs index 91e67411d44..addc2fafc80 100644 --- a/rust/lance-io/src/object_store/providers/memory.rs +++ b/rust/lance-io/src/object_store/providers/memory.rs @@ -19,7 +19,7 @@ pub struct MemoryStoreProvider; impl ObjectStoreProvider for MemoryStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_LOCAL_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); let download_retry_count = storage_options.download_retry_count(); Ok(ObjectStore { inner: Arc::new(InMemory::new()), @@ -32,7 +32,7 @@ impl ObjectStoreProvider for MemoryStoreProvider { download_retry_count, io_tracker: Default::default(), store_prefix: self - .calculate_object_store_prefix(&base_path, params.storage_options.as_ref())?, + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } diff --git a/rust/lance-io/src/object_store/providers/oss.rs b/rust/lance-io/src/object_store/providers/oss.rs index 115227a0d90..aace58921d2 100644 --- a/rust/lance-io/src/object_store/providers/oss.rs +++ b/rust/lance-io/src/object_store/providers/oss.rs @@ -22,7 +22,7 @@ pub struct OssStoreProvider; impl ObjectStoreProvider for OssStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); let bucket = base_path .host_str() @@ -107,8 +107,7 @@ impl ObjectStoreProvider for OssStoreProvider { io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, download_retry_count: storage_options.download_retry_count(), io_tracker: Default::default(), - store_prefix: self - .calculate_object_store_prefix(&url, params.storage_options.as_ref())?, + store_prefix: self.calculate_object_store_prefix(&url, params.storage_options())?, }) } } diff --git a/rust/lance-io/src/object_store/storage_options.rs b/rust/lance-io/src/object_store/storage_options.rs index 22854e8fd53..d0f5cc20e93 100644 --- a/rust/lance-io/src/object_store/storage_options.rs +++ b/rust/lance-io/src/object_store/storage_options.rs @@ -1,25 +1,42 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -//! Storage options provider for dynamic credential fetching +//! Storage options provider and accessor for dynamic credential fetching //! -//! This module provides a trait for fetching storage options from various sources -//! (namespace servers, secret managers, etc.) with support for expiration tracking -//! and automatic refresh. +//! This module provides: +//! - [`StorageOptionsProvider`] trait for fetching storage options from various sources +//! (namespace servers, secret managers, etc.) with support for expiration tracking +//! - [`StorageOptionsAccessor`] for unified access to storage options with automatic +//! caching and refresh use std::collections::HashMap; use std::fmt; use std::sync::Arc; +use std::time::Duration; + +#[cfg(test)] +use mock_instant::thread_local::{SystemTime, UNIX_EPOCH}; + +#[cfg(not(test))] +use std::time::{SystemTime, UNIX_EPOCH}; -use crate::{Error, Result}; use async_trait::async_trait; use lance_namespace::models::DescribeTableRequest; use lance_namespace::LanceNamespace; use snafu::location; +use tokio::sync::RwLock; + +use crate::{Error, Result}; /// Key for the expiration timestamp in storage options HashMap pub const EXPIRES_AT_MILLIS_KEY: &str = "expires_at_millis"; +/// Key for the refresh offset in storage options HashMap (milliseconds before expiry to refresh) +pub const REFRESH_OFFSET_MILLIS_KEY: &str = "refresh_offset_millis"; + +/// Default refresh offset: 60 seconds before expiration +const DEFAULT_REFRESH_OFFSET_MILLIS: u64 = 60_000; + /// Trait for providing storage options with expiration tracking /// /// Implementations can fetch storage options from various sources (namespace servers, @@ -139,3 +156,558 @@ impl StorageOptionsProvider for LanceNamespaceStorageOptionsProvider { ) } } + +/// Unified access to storage options with automatic caching and refresh +/// +/// This struct bundles static storage options with an optional dynamic provider, +/// handling all caching and refresh logic internally. It provides a single entry point +/// for accessing storage options regardless of whether they're static or dynamic. +/// +/// # Behavior +/// +/// - If only static options are provided, returns those options +/// - If a provider is configured, fetches from provider and caches results +/// - Automatically refreshes cached options before expiration (based on refresh_offset) +/// - Uses `expires_at_millis` key to track expiration +/// +/// # Thread Safety +/// +/// The accessor is thread-safe and can be shared across multiple tasks. +/// Concurrent refresh attempts are deduplicated using a try-lock mechanism. +pub struct StorageOptionsAccessor { + /// Initial/fallback static storage options + initial_options: Option>, + + /// Optional dynamic provider for refreshing options + provider: Option>, + + /// Cached storage options with expiration tracking + cache: Arc>>, + + /// Duration before expiry to trigger refresh + refresh_offset: Duration, +} + +impl fmt::Debug for StorageOptionsAccessor { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("StorageOptionsAccessor") + .field("has_initial_options", &self.initial_options.is_some()) + .field("has_provider", &self.provider.is_some()) + .field("refresh_offset", &self.refresh_offset) + .finish() + } +} + +#[derive(Debug, Clone)] +struct CachedStorageOptions { + options: HashMap, + expires_at_millis: Option, +} + +impl StorageOptionsAccessor { + /// Extract refresh offset from storage options, or use default + fn extract_refresh_offset(options: &HashMap) -> Duration { + options + .get(REFRESH_OFFSET_MILLIS_KEY) + .and_then(|s| s.parse::().ok()) + .map(Duration::from_millis) + .unwrap_or(Duration::from_millis(DEFAULT_REFRESH_OFFSET_MILLIS)) + } + + /// Create an accessor with only static options (no refresh capability) + /// + /// The returned accessor will always return the provided options. + /// This is useful when credentials don't expire or are managed externally. + pub fn with_static_options(options: HashMap) -> Self { + let expires_at_millis = options + .get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::().ok()); + let refresh_offset = Self::extract_refresh_offset(&options); + + Self { + initial_options: Some(options.clone()), + provider: None, + cache: Arc::new(RwLock::new(Some(CachedStorageOptions { + options, + expires_at_millis, + }))), + refresh_offset, + } + } + + /// Create an accessor with a dynamic provider (no initial options) + /// + /// The accessor will fetch from the provider on first access and cache + /// the results. Refresh happens automatically before expiration. + /// Uses the default refresh offset (60 seconds) until options are fetched. + /// + /// # Arguments + /// * `provider` - The storage options provider for fetching fresh options + pub fn with_provider(provider: Arc) -> Self { + Self { + initial_options: None, + provider: Some(provider), + cache: Arc::new(RwLock::new(None)), + refresh_offset: Duration::from_millis(DEFAULT_REFRESH_OFFSET_MILLIS), + } + } + + /// Create an accessor with initial options and a dynamic provider + /// + /// Initial options are used until they expire, then the provider is called. + /// This avoids an immediate fetch when initial credentials are still valid. + /// The `refresh_offset_millis` key in initial_options controls refresh timing. + /// + /// # Arguments + /// * `initial_options` - Initial storage options to cache + /// * `provider` - The storage options provider for refreshing + pub fn with_initial_and_provider( + initial_options: HashMap, + provider: Arc, + ) -> Self { + let expires_at_millis = initial_options + .get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::().ok()); + let refresh_offset = Self::extract_refresh_offset(&initial_options); + + Self { + initial_options: Some(initial_options.clone()), + provider: Some(provider), + cache: Arc::new(RwLock::new(Some(CachedStorageOptions { + options: initial_options, + expires_at_millis, + }))), + refresh_offset, + } + } + + /// Get current valid storage options + /// + /// - Returns cached options if not expired + /// - Fetches from provider if expired or not cached + /// - Falls back to initial_options if provider returns None + /// + /// # Errors + /// + /// Returns an error if: + /// - The provider fails to fetch options + /// - No options are available (no cache, no provider, no initial options) + pub async fn get_storage_options(&self) -> Result { + loop { + match self.do_get_storage_options().await? { + Some(options) => return Ok(options), + None => { + // Lock was busy, wait 10ms before retrying + tokio::time::sleep(Duration::from_millis(10)).await; + continue; + } + } + } + } + + async fn do_get_storage_options(&self) -> Result> { + // Check if we have valid cached options with read lock + { + let cached = self.cache.read().await; + if !self.needs_refresh(&cached) { + if let Some(cached_opts) = &*cached { + return Ok(Some(super::StorageOptions(cached_opts.options.clone()))); + } + } + } + + // If no provider, return initial options or error + let Some(provider) = &self.provider else { + return if let Some(initial) = &self.initial_options { + Ok(Some(super::StorageOptions(initial.clone()))) + } else { + Err(Error::IO { + source: Box::new(std::io::Error::other("No storage options available")), + location: location!(), + }) + }; + }; + + // Try to acquire write lock - if it fails, return None and let caller retry + let Ok(mut cache) = self.cache.try_write() else { + return Ok(None); + }; + + // Double-check if options are still stale after acquiring write lock + // (another thread might have refreshed them) + if !self.needs_refresh(&cache) { + if let Some(cached_opts) = &*cache { + return Ok(Some(super::StorageOptions(cached_opts.options.clone()))); + } + } + + log::debug!( + "Refreshing storage options from provider: {}", + provider.provider_id() + ); + + let storage_options_map = + provider + .fetch_storage_options() + .await + .map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to fetch storage options: {}", + e + ))), + location: location!(), + })?; + + let Some(options) = storage_options_map else { + // Provider returned None, fall back to initial options + if let Some(initial) = &self.initial_options { + return Ok(Some(super::StorageOptions(initial.clone()))); + } + return Err(Error::IO { + source: Box::new(std::io::Error::other( + "Provider returned no storage options", + )), + location: location!(), + }); + }; + + let expires_at_millis = options + .get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::().ok()); + + if let Some(expires_at) = expires_at_millis { + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or(Duration::from_secs(0)) + .as_millis() as u64; + let expires_in_secs = (expires_at.saturating_sub(now_ms)) / 1000; + log::debug!( + "Successfully refreshed storage options from provider: {}, options expire in {} seconds", + provider.provider_id(), + expires_in_secs + ); + } else { + log::debug!( + "Successfully refreshed storage options from provider: {} (no expiration)", + provider.provider_id() + ); + } + + *cache = Some(CachedStorageOptions { + options: options.clone(), + expires_at_millis, + }); + + Ok(Some(super::StorageOptions(options))) + } + + fn needs_refresh(&self, cached: &Option) -> bool { + match cached { + None => true, + Some(cached_opts) => { + if let Some(expires_at_millis) = cached_opts.expires_at_millis { + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or(Duration::from_secs(0)) + .as_millis() as u64; + + // Refresh if we're within the refresh offset of expiration + let refresh_offset_millis = self.refresh_offset.as_millis() as u64; + now_ms + refresh_offset_millis >= expires_at_millis + } else { + // No expiration means options never expire + false + } + } + } + } + + /// Get the initial storage options without refresh + /// + /// Returns the initial options that were provided when creating the accessor. + /// This does not trigger any refresh, even if the options have expired. + pub fn initial_storage_options(&self) -> Option<&HashMap> { + self.initial_options.as_ref() + } + + /// Get the accessor ID for equality/hashing + /// + /// Returns the provider_id if a provider exists, otherwise generates + /// a stable ID from the initial options hash. + pub fn accessor_id(&self) -> String { + if let Some(provider) = &self.provider { + provider.provider_id() + } else if let Some(initial) = &self.initial_options { + // Generate a stable ID from initial options + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let mut hasher = DefaultHasher::new(); + let mut keys: Vec<_> = initial.keys().collect(); + keys.sort(); + for key in keys { + key.hash(&mut hasher); + initial.get(key).hash(&mut hasher); + } + format!("static_options_{:x}", hasher.finish()) + } else { + "empty_accessor".to_string() + } + } + + /// Check if this accessor has a dynamic provider + pub fn has_provider(&self) -> bool { + self.provider.is_some() + } + + /// Get the refresh offset duration + pub fn refresh_offset(&self) -> Duration { + self.refresh_offset + } + + /// Get the storage options provider, if any + pub fn provider(&self) -> Option<&Arc> { + self.provider.as_ref() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use mock_instant::thread_local::MockClock; + + #[derive(Debug)] + struct MockStorageOptionsProvider { + call_count: Arc>, + expires_in_millis: Option, + } + + impl MockStorageOptionsProvider { + fn new(expires_in_millis: Option) -> Self { + Self { + call_count: Arc::new(RwLock::new(0)), + expires_in_millis, + } + } + + async fn get_call_count(&self) -> usize { + *self.call_count.read().await + } + } + + #[async_trait] + impl StorageOptionsProvider for MockStorageOptionsProvider { + async fn fetch_storage_options(&self) -> Result>> { + let count = { + let mut c = self.call_count.write().await; + *c += 1; + *c + }; + + let mut options = HashMap::from([ + ("aws_access_key_id".to_string(), format!("AKID_{}", count)), + ( + "aws_secret_access_key".to_string(), + format!("SECRET_{}", count), + ), + ("aws_session_token".to_string(), format!("TOKEN_{}", count)), + ]); + + if let Some(expires_in) = self.expires_in_millis { + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() as u64; + let expires_at = now_ms + expires_in; + options.insert(EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()); + } + + Ok(Some(options)) + } + + fn provider_id(&self) -> String { + let ptr = Arc::as_ptr(&self.call_count) as usize; + format!("MockStorageOptionsProvider {{ id: {} }}", ptr) + } + } + + #[tokio::test] + async fn test_static_options_only() { + let options = HashMap::from([ + ("key1".to_string(), "value1".to_string()), + ("key2".to_string(), "value2".to_string()), + ]); + let accessor = StorageOptionsAccessor::with_static_options(options.clone()); + + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0, options); + assert!(!accessor.has_provider()); + assert_eq!(accessor.initial_storage_options(), Some(&options)); + } + + #[tokio::test] + async fn test_provider_only() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + let accessor = StorageOptionsAccessor::with_provider(mock_provider.clone()); + + let result = accessor.get_storage_options().await.unwrap(); + assert!(result.0.contains_key("aws_access_key_id")); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + assert!(accessor.has_provider()); + assert_eq!(accessor.initial_storage_options(), None); + assert_eq!(mock_provider.get_call_count().await, 1); + } + + #[tokio::test] + async fn test_initial_and_provider_uses_initial_first() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let now_ms = MockClock::system_time().as_millis() as u64; + let expires_at = now_ms + 600_000; // 10 minutes from now + + let initial = HashMap::from([ + ("aws_access_key_id".to_string(), "INITIAL_KEY".to_string()), + ( + "aws_secret_access_key".to_string(), + "INITIAL_SECRET".to_string(), + ), + (EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()), + ]); + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + let accessor = StorageOptionsAccessor::with_initial_and_provider( + initial.clone(), + mock_provider.clone(), + ); + + // First call uses initial + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "INITIAL_KEY"); + assert_eq!(mock_provider.get_call_count().await, 0); // Provider not called yet + } + + #[tokio::test] + async fn test_caching_and_refresh() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); // 10 min expiry + // Use with_initial_and_provider to set custom refresh_offset_millis (5 min = 300000ms) + let now_ms = MockClock::system_time().as_millis() as u64; + let expires_at = now_ms + 600_000; // 10 minutes from now + let initial = HashMap::from([ + (EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()), + (REFRESH_OFFSET_MILLIS_KEY.to_string(), "300000".to_string()), // 5 min refresh offset + ]); + let accessor = + StorageOptionsAccessor::with_initial_and_provider(initial, mock_provider.clone()); + + // First call uses initial cached options + let result = accessor.get_storage_options().await.unwrap(); + assert!(result.0.contains_key(EXPIRES_AT_MILLIS_KEY)); + assert_eq!(mock_provider.get_call_count().await, 0); + + // Advance time to 6 minutes - should trigger refresh (within 5 min refresh offset) + MockClock::set_system_time(Duration::from_secs(100_000 + 360)); + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + assert_eq!(mock_provider.get_call_count().await, 1); + } + + #[tokio::test] + async fn test_expired_initial_triggers_refresh() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let now_ms = MockClock::system_time().as_millis() as u64; + let expired_time = now_ms - 1_000; // Expired 1 second ago + + let initial = HashMap::from([ + ("aws_access_key_id".to_string(), "EXPIRED_KEY".to_string()), + (EXPIRES_AT_MILLIS_KEY.to_string(), expired_time.to_string()), + ]); + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + let accessor = + StorageOptionsAccessor::with_initial_and_provider(initial, mock_provider.clone()); + + // Should fetch from provider since initial is expired + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + assert_eq!(mock_provider.get_call_count().await, 1); + } + + #[tokio::test] + async fn test_accessor_id_with_provider() { + let mock_provider = Arc::new(MockStorageOptionsProvider::new(None)); + let accessor = StorageOptionsAccessor::with_provider(mock_provider); + + let id = accessor.accessor_id(); + assert!(id.starts_with("MockStorageOptionsProvider")); + } + + #[tokio::test] + async fn test_accessor_id_static() { + let options = HashMap::from([("key".to_string(), "value".to_string())]); + let accessor = StorageOptionsAccessor::with_static_options(options); + + let id = accessor.accessor_id(); + assert!(id.starts_with("static_options_")); + } + + #[tokio::test] + async fn test_concurrent_access() { + // Create a mock provider with far future expiration + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(9999999999999))); + + let accessor = Arc::new(StorageOptionsAccessor::with_provider(mock_provider.clone())); + + // Spawn 10 concurrent tasks that all try to get options at the same time + let mut handles = vec![]; + for i in 0..10 { + let acc = accessor.clone(); + let handle = tokio::spawn(async move { + let result = acc.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + i + }); + handles.push(handle); + } + + // Wait for all tasks to complete + let results: Vec<_> = futures::future::join_all(handles) + .await + .into_iter() + .map(|r| r.unwrap()) + .collect(); + + // Verify all 10 tasks completed successfully + assert_eq!(results.len(), 10); + + // The provider should have been called exactly once + let call_count = mock_provider.get_call_count().await; + assert_eq!( + call_count, 1, + "Provider should be called exactly once despite concurrent access" + ); + } + + #[tokio::test] + async fn test_no_expiration_never_refreshes() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let mock_provider = Arc::new(MockStorageOptionsProvider::new(None)); // No expiration + let accessor = StorageOptionsAccessor::with_provider(mock_provider.clone()); + + // First call fetches + accessor.get_storage_options().await.unwrap(); + assert_eq!(mock_provider.get_call_count().await, 1); + + // Advance time significantly + MockClock::set_system_time(Duration::from_secs(200_000)); + + // Should still use cached options + accessor.get_storage_options().await.unwrap(); + assert_eq!(mock_provider.get_call_count().await, 1); + } +} diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index 4d6a88419ee..875df33e580 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -471,8 +471,11 @@ impl DirectoryNamespaceBuilder { session: &Option>, ) -> Result<(Arc, Path)> { // Build ObjectStoreParams from storage options + let accessor = storage_options.clone().map(|opts| { + Arc::new(lance_io::object_store::StorageOptionsAccessor::with_static_options(opts)) + }); let params = ObjectStoreParams { - storage_options: storage_options.clone(), + storage_options_accessor: accessor, ..Default::default() }; @@ -1262,7 +1265,9 @@ impl LanceNamespace for DirectoryNamespace { }; let store_params = self.storage_options.as_ref().map(|opts| ObjectStoreParams { - storage_options: Some(opts.clone()), + storage_options_accessor: Some(Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options(opts.clone()), + )), ..Default::default() }); @@ -3106,15 +3111,10 @@ mod tests { .unwrap(); let reader1 = RecordBatchIterator::new(vec![data1].into_iter().map(Ok), schema.clone()); - let dataset = Dataset::write_into_namespace( - reader1, - namespace.clone(), - table_id.clone(), - None, - false, - ) - .await - .unwrap(); + let dataset = + Dataset::write_into_namespace(reader1, namespace.clone(), table_id.clone(), None) + .await + .unwrap(); assert_eq!(dataset.count_rows(None).await.unwrap(), 3); assert_eq!(dataset.version().version, 1); @@ -3140,7 +3140,6 @@ mod tests { namespace.clone(), table_id.clone(), Some(params_append), - false, ) .await .unwrap(); @@ -3169,7 +3168,6 @@ mod tests { namespace.clone(), table_id.clone(), Some(params_overwrite), - false, ) .await .unwrap(); diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index bfcb9602b9a..49d19712e26 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -982,7 +982,11 @@ impl ManifestNamespace { let write_params = WriteParams { session, store_params: storage_options.as_ref().map(|opts| ObjectStoreParams { - storage_options: Some(opts.clone()), + storage_options_accessor: Some(Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + opts.clone(), + ), + )), ..Default::default() }), ..Default::default() diff --git a/rust/lance-namespace-impls/src/rest_adapter.rs b/rust/lance-namespace-impls/src/rest_adapter.rs index 899863793ff..b63331c8a66 100644 --- a/rust/lance-namespace-impls/src/rest_adapter.rs +++ b/rust/lance-namespace-impls/src/rest_adapter.rs @@ -2695,15 +2695,10 @@ mod tests { .unwrap(); let reader1 = RecordBatchIterator::new(vec![data1].into_iter().map(Ok), schema.clone()); - let dataset = Dataset::write_into_namespace( - reader1, - namespace.clone(), - table_id.clone(), - None, - false, - ) - .await - .unwrap(); + let dataset = + Dataset::write_into_namespace(reader1, namespace.clone(), table_id.clone(), None) + .await + .unwrap(); assert_eq!(dataset.count_rows(None).await.unwrap(), 3); assert_eq!(dataset.version().version, 1); @@ -2729,7 +2724,6 @@ mod tests { namespace.clone(), table_id.clone(), Some(params_append), - false, ) .await .unwrap(); @@ -2758,7 +2752,6 @@ mod tests { namespace.clone(), table_id.clone(), Some(params_overwrite), - false, ) .await .unwrap(); diff --git a/rust/lance-table/src/io/commit.rs b/rust/lance-table/src/io/commit.rs index be650beac50..12e48121b36 100644 --- a/rust/lance-table/src/io/commit.rs +++ b/rust/lance-table/src/io/commit.rs @@ -765,20 +765,22 @@ pub async fn commit_handler_from_url( } }; let options = options.clone().unwrap_or_default(); - let storage_options = StorageOptions(options.storage_options.unwrap_or_default()); - let dynamo_endpoint = get_dynamodb_endpoint(&storage_options); - let expires_at_millis = storage_options.expires_at_millis(); - let storage_options = storage_options.as_s3_options(); + let storage_options_raw = + StorageOptions(options.storage_options().cloned().unwrap_or_default()); + let dynamo_endpoint = get_dynamodb_endpoint(&storage_options_raw); + let storage_options = storage_options_raw.as_s3_options(); let region = storage_options.get(&AmazonS3ConfigKey::Region).cloned(); + // Get accessor from the options + let accessor = options.get_accessor(); + let (aws_creds, region) = build_aws_credential( options.s3_credentials_refresh_offset, options.aws_credentials.clone(), Some(&storage_options), region, - options.storage_options_provider.clone(), - expires_at_millis, + accessor, ) .await?; diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index db291c546d0..fe0a989dbd6 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -35,7 +35,8 @@ use lance_file::reader::FileReaderOptions; use lance_file::version::LanceFileVersion; use lance_index::DatasetIndexExt; use lance_io::object_store::{ - LanceNamespaceStorageOptionsProvider, ObjectStore, ObjectStoreParams, + LanceNamespaceStorageOptionsProvider, ObjectStore, ObjectStoreParams, StorageOptions, + StorageOptionsAccessor, StorageOptionsProvider, }; use lance_io::utils::{read_last_block, read_message, read_metadata_offset, read_struct}; use lance_namespace::LanceNamespace; @@ -808,15 +809,11 @@ impl Dataset { /// * `namespace` - The namespace to use for table management /// * `table_id` - The table identifier /// * `params` - Write parameters - /// * `ignore_namespace_table_storage_options` - If true, ignore storage options returned - /// by the namespace and only use the storage options in params. The storage options - /// provider will not be created, so credentials will not be automatically refreshed. pub async fn write_into_namespace( batches: impl RecordBatchReader + Send + 'static, namespace: Arc, table_id: Vec, mut params: Option, - ignore_namespace_table_storage_options: bool, ) -> Result { let mut write_params = params.take().unwrap_or_default(); @@ -865,28 +862,30 @@ impl Dataset { location: location!(), })?; - // Set initial credentials and provider unless ignored - if !ignore_namespace_table_storage_options { - if let Some(namespace_storage_options) = response.storage_options { - let provider = Arc::new(LanceNamespaceStorageOptionsProvider::new( - namespace, table_id, - )); + // Set initial credentials and provider from namespace + if let Some(namespace_storage_options) = response.storage_options { + let provider: Arc = Arc::new( + LanceNamespaceStorageOptionsProvider::new(namespace, table_id), + ); - // Merge namespace storage options with any existing options - let mut merged_options = write_params - .store_params - .as_ref() - .and_then(|p| p.storage_options.clone()) - .unwrap_or_default(); - merged_options.extend(namespace_storage_options); - - let existing_params = write_params.store_params.take().unwrap_or_default(); - write_params.store_params = Some(ObjectStoreParams { - storage_options: Some(merged_options), - storage_options_provider: Some(provider), - ..existing_params - }); - } + // Merge namespace storage options with any existing options + let mut merged_options = write_params + .store_params + .as_ref() + .and_then(|p| p.storage_options().cloned()) + .unwrap_or_default(); + merged_options.extend(namespace_storage_options); + + let accessor = Arc::new(StorageOptionsAccessor::with_initial_and_provider( + merged_options, + provider, + )); + + let existing_params = write_params.store_params.take().unwrap_or_default(); + write_params.store_params = Some(ObjectStoreParams { + storage_options_accessor: Some(accessor), + ..existing_params + }); } Self::write(batches, uri.as_str(), Some(write_params)).await @@ -912,29 +911,32 @@ impl Dataset { location: location!(), })?; - // Set initial credentials and provider unless ignored - if !ignore_namespace_table_storage_options { - if let Some(namespace_storage_options) = response.storage_options { - let provider = Arc::new(LanceNamespaceStorageOptionsProvider::new( + // Set initial credentials and provider from namespace + if let Some(namespace_storage_options) = response.storage_options { + let provider: Arc = + Arc::new(LanceNamespaceStorageOptionsProvider::new( namespace.clone(), table_id.clone(), )); - // Merge namespace storage options with any existing options - let mut merged_options = write_params - .store_params - .as_ref() - .and_then(|p| p.storage_options.clone()) - .unwrap_or_default(); - merged_options.extend(namespace_storage_options); - - let existing_params = write_params.store_params.take().unwrap_or_default(); - write_params.store_params = Some(ObjectStoreParams { - storage_options: Some(merged_options), - storage_options_provider: Some(provider), - ..existing_params - }); - } + // Merge namespace storage options with any existing options + let mut merged_options = write_params + .store_params + .as_ref() + .and_then(|p| p.storage_options().cloned()) + .unwrap_or_default(); + merged_options.extend(namespace_storage_options); + + let accessor = Arc::new(StorageOptionsAccessor::with_initial_and_provider( + merged_options, + provider, + )); + + let existing_params = write_params.store_params.take().unwrap_or_default(); + write_params.store_params = Some(ObjectStoreParams { + storage_options_accessor: Some(accessor), + ..existing_params + }); } // For APPEND/OVERWRITE modes, we must open the existing dataset first @@ -942,11 +944,8 @@ impl Dataset { // assumes no dataset exists and converts the mode to CREATE. let mut builder = DatasetBuilder::from_uri(uri.as_str()); if let Some(ref store_params) = write_params.store_params { - if let Some(ref storage_options) = store_params.storage_options { - builder = builder.with_storage_options(storage_options.clone()); - } - if let Some(ref provider) = store_params.storage_options_provider { - builder = builder.with_storage_options_provider(provider.clone()); + if let Some(accessor) = &store_params.storage_options_accessor { + builder = builder.with_storage_options_accessor(accessor.clone()); } } let dataset = Arc::new(builder.load().await?); @@ -1612,11 +1611,22 @@ impl Dataset { &self.object_store } - /// Returns the storage options used when opening this dataset, if any. + /// Returns the initial storage options used when opening this dataset, if any. + /// + /// This returns the static initial options without triggering any refresh. + /// For the latest refreshed options, use [`Self::latest_storage_options`]. + #[deprecated(since = "0.25.0", note = "Use initial_storage_options() instead")] pub fn storage_options(&self) -> Option<&HashMap> { + self.initial_storage_options() + } + + /// Returns the initial storage options without triggering any refresh. + /// + /// For the latest refreshed options, use [`Self::latest_storage_options`]. + pub fn initial_storage_options(&self) -> Option<&HashMap> { self.store_params .as_ref() - .and_then(|params| params.storage_options.as_ref()) + .and_then(|params| params.storage_options()) } /// Returns the storage options provider used when opening this dataset, if any. @@ -1625,7 +1635,42 @@ impl Dataset { ) -> Option> { self.store_params .as_ref() - .and_then(|params| params.storage_options_provider.clone()) + .and_then(|params| params.storage_options_accessor.as_ref()) + .and_then(|accessor| accessor.provider().cloned()) + } + + /// Returns the unified storage options accessor for this dataset, if any. + /// + /// The accessor handles both static and dynamic storage options with automatic + /// caching and refresh. Use [`StorageOptionsAccessor::get_storage_options`] to + /// get the latest options. + pub fn storage_options_accessor(&self) -> Option> { + self.store_params + .as_ref() + .and_then(|params| params.get_accessor()) + } + + /// Returns the latest (possibly refreshed) storage options. + /// + /// If a dynamic storage options provider is configured, this will return + /// the cached options if still valid, or fetch fresh options if expired. + /// + /// For the initial static options without refresh, use [`Self::storage_options`]. + /// + /// # Returns + /// + /// - `Ok(Some(options))` - Storage options are available (static or refreshed) + /// - `Ok(None)` - No storage options were configured for this dataset + /// - `Err(...)` - Error occurred while fetching/refreshing options from provider + pub async fn latest_storage_options(&self) -> Result> { + // First check if we have an accessor (handles both static and dynamic options) + if let Some(accessor) = self.storage_options_accessor() { + let options = accessor.get_storage_options().await?; + return Ok(Some(options)); + } + + // Fallback to initial storage options if no accessor + Ok(self.initial_storage_options().cloned().map(StorageOptions)) } pub fn data_dir(&self) -> Path { diff --git a/rust/lance/src/dataset/builder.rs b/rust/lance/src/dataset/builder.rs index 639fda28bca..9eba7009d19 100644 --- a/rust/lance/src/dataset/builder.rs +++ b/rust/lance/src/dataset/builder.rs @@ -12,7 +12,7 @@ use lance_file::datatypes::populate_schema_dictionary; use lance_file::reader::FileReaderOptions; use lance_io::object_store::{ LanceNamespaceStorageOptionsProvider, ObjectStore, ObjectStoreParams, StorageOptions, - DEFAULT_CLOUD_IO_PARALLELISM, + StorageOptionsAccessor, DEFAULT_CLOUD_IO_PARALLELISM, }; use lance_namespace::models::DescribeTableRequest; use lance_namespace::LanceNamespace; @@ -95,8 +95,6 @@ impl DatasetBuilder { /// # Arguments /// * `namespace` - The namespace implementation to fetch table info from /// * `table_id` - The table identifier (e.g., vec!["my_table"]) - /// * `ignore_namespace_table_storage_options` - If true, storage options returned from - /// the namespace's `describe_table()` will be ignored (treated as None). Defaults to false. /// /// # Example /// ```ignore @@ -111,28 +109,17 @@ impl DatasetBuilder { /// /// // Load a dataset using storage options from namespace /// let dataset = DatasetBuilder::from_namespace( - /// namespace.clone(), - /// vec!["my_table".to_string()], - /// false, - /// ) - /// .await? - /// .load() - /// .await?; - /// - /// // Load a dataset ignoring namespace storage options - /// let dataset = DatasetBuilder::from_namespace( /// namespace, /// vec!["my_table".to_string()], - /// true, /// ) /// .await? /// .load() /// .await?; /// ``` + #[allow(deprecated)] pub async fn from_namespace( namespace: Arc, table_id: Vec, - ignore_namespace_table_storage_options: bool, ) -> Result { let request = DescribeTableRequest { id: Some(table_id.clone()), @@ -156,17 +143,17 @@ impl DatasetBuilder { let mut builder = Self::from_uri(table_uri); - let namespace_storage_options = if ignore_namespace_table_storage_options { - None - } else { - response.storage_options - }; + // Use namespace storage options if available + let namespace_storage_options = response.storage_options; builder.storage_options_override = namespace_storage_options.clone(); - if namespace_storage_options.is_some() { - builder.options.storage_options_provider = Some(Arc::new( + if let Some(initial_opts) = namespace_storage_options { + let provider: Arc = Arc::new( LanceNamespaceStorageOptionsProvider::new(namespace, table_id), + ); + builder.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(initial_opts, provider), )); } @@ -289,7 +276,27 @@ impl DatasetBuilder { /// - [S3 options](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html#variants) /// - [Google options](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html#variants) pub fn with_storage_options(mut self, storage_options: HashMap) -> Self { - self.options.storage_options = Some(storage_options); + // Merge with existing options if accessor exists, otherwise create new static accessor + if let Some(existing) = self.options.storage_options_accessor.take() { + let mut merged = existing + .initial_storage_options() + .cloned() + .unwrap_or_default(); + merged.extend(storage_options); + if let Some(provider) = existing.provider().cloned() { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(merged, provider), + )); + } else { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_static_options(merged), + )); + } + } else { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_static_options(storage_options), + )); + } self } @@ -301,9 +308,25 @@ impl DatasetBuilder { /// .with_storage_option("region", "us-east-1"); /// ``` pub fn with_storage_option(mut self, key: impl AsRef, value: impl AsRef) -> Self { - let mut storage_options = self.options.storage_options.unwrap_or_default(); + let mut storage_options = self.options.storage_options().cloned().unwrap_or_default(); storage_options.insert(key.as_ref().to_string(), value.as_ref().to_string()); - self.options.storage_options = Some(storage_options); + + // Merge with existing accessor if present + if let Some(existing) = self.options.storage_options_accessor.take() { + if let Some(provider) = existing.provider().cloned() { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(storage_options, provider), + )); + } else { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_static_options(storage_options), + )); + } + } else { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_static_options(storage_options), + )); + } self } @@ -355,7 +378,50 @@ impl DatasetBuilder { mut self, provider: Arc, ) -> Self { - self.options.storage_options_provider = Some(provider); + // Preserve existing storage options if any + if let Some(existing) = self.options.storage_options_accessor.take() { + if let Some(initial) = existing.initial_storage_options().cloned() { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(initial, provider), + )); + } else { + self.options.storage_options_accessor = + Some(Arc::new(StorageOptionsAccessor::with_provider(provider))); + } + } else { + self.options.storage_options_accessor = + Some(Arc::new(StorageOptionsAccessor::with_provider(provider))); + } + self + } + + /// Set a unified storage options accessor for credential management + /// + /// The accessor bundles static storage options with an optional dynamic provider, + /// handling all caching and refresh logic internally. + /// + /// # Arguments + /// * `accessor` - The storage options accessor + /// + /// # Example + /// ```ignore + /// use std::sync::Arc; + /// use std::time::Duration; + /// use lance_io::object_store::StorageOptionsAccessor; + /// + /// // Create an accessor with a dynamic provider + /// let accessor = Arc::new(StorageOptionsAccessor::with_provider( + /// provider, + /// Duration::from_secs(300), // 5 minute refresh offset + /// )); + /// + /// let dataset = DatasetBuilder::from_uri("s3://bucket/table.lance") + /// .with_storage_options_accessor(accessor) + /// .load() + /// .await?; + /// ``` + pub fn with_storage_options_accessor(mut self, accessor: Arc) -> Self { + self.options.storage_options_accessor = Some(accessor); self } @@ -418,8 +484,8 @@ impl DatasetBuilder { let storage_options = self .options - .storage_options - .clone() + .storage_options() + .cloned() .map(StorageOptions::new) .unwrap_or_default(); let download_retry_count = storage_options.download_retry_count(); @@ -478,12 +544,29 @@ impl DatasetBuilder { } async fn load_impl(mut self) -> Result { - // Apply storage_options_override last to ensure namespace options take precedence + // Apply storage_options_override to merge namespace options with any existing accessor if let Some(override_opts) = self.storage_options_override.take() { - let mut merged_opts = self.options.storage_options.clone().unwrap_or_default(); + // Get existing options and merge + let mut merged_opts = self.options.storage_options().cloned().unwrap_or_default(); // Override with namespace storage options - they take precedence merged_opts.extend(override_opts); - self.options.storage_options = Some(merged_opts); + + // Update accessor with merged options + if let Some(accessor) = &self.options.storage_options_accessor { + if let Some(provider) = accessor.provider().cloned() { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(merged_opts, provider), + )); + } else { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_static_options(merged_opts), + )); + } + } else { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_static_options(merged_opts), + )); + } } let session = match self.session.as_ref() { diff --git a/rust/lance/src/dataset/fragment/write.rs b/rust/lance/src/dataset/fragment/write.rs index 0ab7dd15d21..a9c87e44a05 100644 --- a/rust/lance/src/dataset/fragment/write.rs +++ b/rust/lance/src/dataset/fragment/write.rs @@ -303,12 +303,12 @@ impl<'a> FragmentCreateBuilder<'a> { async fn existing_dataset_schema(&self) -> Result> { let mut builder = DatasetBuilder::from_uri(self.dataset_uri); - let storage_options = self + let accessor = self .write_params .and_then(|p| p.store_params.as_ref()) - .and_then(|p| p.storage_options.clone()); - if let Some(storage_options) = storage_options { - builder = builder.with_storage_options(storage_options); + .and_then(|p| p.storage_options_accessor.clone()); + if let Some(accessor) = accessor { + builder = builder.with_storage_options_accessor(accessor); } match builder.load().await { Ok(dataset) => { diff --git a/rust/lance/src/io.rs b/rust/lance/src/io.rs index 1ad45ce2d68..1113ef0a2a7 100644 --- a/rust/lance/src/io.rs +++ b/rust/lance/src/io.rs @@ -9,6 +9,9 @@ pub mod exec; pub use lance_io::{ bytes_read_counter, iops_counter, - object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry, WrappingObjectStore}, + object_store::{ + ObjectStore, ObjectStoreParams, ObjectStoreRegistry, StorageOptionsAccessor, + WrappingObjectStore, + }, stream::RecordBatchStream, }; diff --git a/rust/lance/src/io/commit/s3_test.rs b/rust/lance/src/io/commit/s3_test.rs index 35e64703688..1402fb25d46 100644 --- a/rust/lance/src/io/commit/s3_test.rs +++ b/rust/lance/src/io/commit/s3_test.rs @@ -8,7 +8,7 @@ use crate::{ dataset::{ builder::DatasetBuilder, CommitBuilder, InsertBuilder, ReadParams, WriteMode, WriteParams, }, - io::ObjectStoreParams, + io::{ObjectStoreParams, StorageOptionsAccessor}, }; use aws_config::{BehaviorVersion, ConfigLoader, Region, SdkConfig}; use aws_sdk_s3::{config::Credentials, Client as S3Client}; @@ -186,12 +186,12 @@ async fn test_concurrent_writers() { // Create a table let store_params = ObjectStoreParams { object_store_wrapper: Some(io_tracker.clone()), - storage_options: Some( + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( CONFIG .iter() .map(|(k, v)| (k.to_string(), v.to_string())) .collect(), - ), + ))), ..Default::default() }; let write_params = WriteParams { @@ -270,12 +270,12 @@ async fn test_ddb_open_iops() { // Create a table let store_params = ObjectStoreParams { object_store_wrapper: Some(io_tracker.clone()), - storage_options: Some( + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( CONFIG .iter() .map(|(k, v)| (k.to_string(), v.to_string())) .collect(), - ), + ))), ..Default::default() }; let write_params = WriteParams {