From 2bb95e9b7c982b06da68b84d422a857839336990 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 15 Jan 2026 17:41:21 -0800 Subject: [PATCH 1/9] initial impl --- java/lance-jni/src/blocking_dataset.rs | 77 +++ java/lance-jni/src/traits.rs | 22 + java/src/main/java/org/lance/Dataset.java | 36 ++ python/python/lance/dataset.py | 43 ++ python/src/dataset.rs | 34 + python/src/lib.rs | 1 + python/src/storage_options.rs | 132 +++- rust/lance-io/src/object_store.rs | 58 +- .../src/object_store/providers/aws.rs | 445 ++++++------- .../object_store/storage_options_accessor.rs | 584 ++++++++++++++++++ rust/lance-table/src/io/commit.rs | 14 +- rust/lance/src/dataset.rs | 48 +- rust/lance/src/dataset/builder.rs | 37 +- 13 files changed, 1252 insertions(+), 279 deletions(-) create mode 100644 rust/lance-io/src/object_store/storage_options_accessor.rs diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index dde2b81879b..1c34b605071 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -77,10 +77,41 @@ pub struct BlockingDataset { impl BlockingDataset { /// Get the storage options provider that was used when opening this dataset + #[deprecated(note = "Use storage_options_accessor() instead")] pub fn get_storage_options_provider(&self) -> Option> { + #[allow(deprecated)] self.inner.storage_options_provider() } + /// Get the initial storage options used to open this dataset. + /// + /// Returns the options that were provided when the dataset was opened, + /// without any refresh from the provider. Returns None if no storage options + /// were provided. + pub fn storage_options(&self) -> Option> { + self.inner.storage_options().cloned() + } + + /// Get the latest storage options, potentially refreshed from the provider. + /// + /// If a storage options provider was configured and credentials are expiring, + /// this will refresh them. + pub fn latest_storage_options(&self) -> Result>> { + RT.block_on(async { self.inner.latest_storage_options().await }) + .map(|opt| opt.map(|opts| opts.0)) + .map_err(|e| Error::io_error(e.to_string())) + } + + /// Get the storage options accessor for this dataset. + /// + /// The accessor bundles static storage options and optional dynamic provider, + /// handling caching and refresh logic internally. + pub fn storage_options_accessor( + &self, + ) -> Option> { + self.inner.storage_options_accessor() + } + pub fn drop(uri: &str, storage_options: HashMap) -> Result<()> { RT.block_on(async move { let registry = Arc::new(ObjectStoreRegistry::default()); @@ -1319,6 +1350,52 @@ fn inner_latest_version_id(env: &mut JNIEnv, java_dataset: JObject) -> Result( + mut env: JNIEnv<'local>, + java_dataset: JObject, +) -> JObject<'local> { + ok_or_throw!(env, inner_get_storage_options(&mut env, java_dataset)) +} + +fn inner_get_storage_options<'local>( + env: &mut JNIEnv<'local>, + java_dataset: JObject, +) -> Result> { + let storage_options = { + let dataset_guard = + unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; + dataset_guard.storage_options() + }; + match storage_options { + Some(opts) => opts.into_java(env), + None => Ok(JObject::null()), + } +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_Dataset_nativeGetLatestStorageOptions<'local>( + mut env: JNIEnv<'local>, + java_dataset: JObject, +) -> JObject<'local> { + ok_or_throw!(env, inner_get_latest_storage_options(&mut env, java_dataset)) +} + +fn inner_get_latest_storage_options<'local>( + env: &mut JNIEnv<'local>, + java_dataset: JObject, +) -> Result> { + let storage_options = { + let dataset_guard = + unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; + dataset_guard.latest_storage_options()? + }; + match storage_options { + Some(opts) => opts.into_java(env), + None => Ok(JObject::null()), + } +} + #[no_mangle] pub extern "system" fn Java_org_lance_Dataset_nativeCheckoutLatest( mut env: JNIEnv, diff --git a/java/lance-jni/src/traits.rs b/java/lance-jni/src/traits.rs index 7da64d453c2..ebc53b1679a 100644 --- a/java/lance-jni/src/traits.rs +++ b/java/lance-jni/src/traits.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +use std::collections::HashMap; + use jni::objects::{JIntArray, JLongArray, JMap, JObject, JString, JValue, JValueGen}; use jni::JNIEnv; @@ -224,6 +226,26 @@ impl IntoJava for &String { } } +impl IntoJava for HashMap { + fn into_java<'a>(self, env: &mut JNIEnv<'a>) -> Result> { + let hash_map = env.new_object("java/util/HashMap", "()V", &[])?; + for (key, value) in self { + let java_key = env.new_string(&key)?; + let java_value = env.new_string(&value)?; + env.call_method( + &hash_map, + "put", + "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;", + &[ + JValueGen::Object(&java_key.into()), + JValueGen::Object(&java_value.into()), + ], + )?; + } + Ok(hash_map) + } +} + impl IntoJava for JLance> { fn into_java<'a>(self, env: &mut JNIEnv<'a>) -> Result> { let obj = match self.0 { diff --git a/java/src/main/java/org/lance/Dataset.java b/java/src/main/java/org/lance/Dataset.java index cc4e0d03dbb..4bf638f6262 100644 --- a/java/src/main/java/org/lance/Dataset.java +++ b/java/src/main/java/org/lance/Dataset.java @@ -758,6 +758,42 @@ public long latestVersion() { private native long nativeGetLatestVersionId(); + /** + * Get the initial storage options used to open this dataset. + * + *

This returns the options that were provided when the dataset was opened, without any refresh + * from the provider. Returns null if no storage options were provided. + * + * @return the initial storage options, or null if none were provided + */ + public Map getStorageOptions() { + try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) { + Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed"); + return nativeGetStorageOptions(); + } + } + + private native Map nativeGetStorageOptions(); + + /** + * Get the latest storage options, potentially refreshed from the provider. + * + *

If a storage options provider was configured and credentials are expiring, this will refresh + * them. + * + * @return the latest storage options (static or refreshed from provider), or null if no storage + * options were configured for this dataset + * @throws RuntimeException if an error occurs while fetching/refreshing options from the provider + */ + public Map getLatestStorageOptions() { + try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) { + Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed"); + return nativeGetLatestStorageOptions(); + } + } + + private native Map nativeGetLatestStorageOptions(); + /** Checkout the dataset to the latest version. */ public void checkoutLatest() { try (LockManager.WriteLock writeLock = lockManager.acquireWriteLock()) { diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 01066a88d34..bc45824cdbd 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2225,6 +2225,49 @@ def latest_version(self) -> int: """ return self._ds.latest_version() + @property + def storage_options(self) -> Optional[Dict[str, str]]: + """ + Get the initial storage options used to open this dataset. + + This returns the options that were provided when the dataset was opened, + without any refresh from the provider. Returns None if no storage options + were provided. + """ + return self._ds.storage_options() + + def latest_storage_options(self) -> Optional[Dict[str, str]]: + """ + Get the latest storage options, potentially refreshed from the provider. + + If a storage options provider was configured and credentials are expiring, + this will refresh them. + + Returns + ------- + Optional[Dict[str, str]] + - Storage options dict if configured (static or refreshed from provider) + - None if no storage options were configured for this dataset + + Raises + ------ + IOError + If an error occurs while fetching/refreshing options from the provider + """ + return self._ds.latest_storage_options() + + @property + def storage_options_accessor(self): + """ + Get the storage options accessor for this dataset. + + The accessor bundles static storage options and optional dynamic provider, + handling caching and refresh logic internally. + + Returns None if neither storage options nor a provider were configured. + """ + return self._ds.storage_options_accessor() + def checkout_version( self, version: int | str | Tuple[Optional[str], Optional[int]] ) -> "LanceDataset": diff --git a/python/src/dataset.rs b/python/src/dataset.rs index df03763472f..017b3c085e3 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -92,6 +92,7 @@ use crate::rt; use crate::scanner::ScanStatistics; use crate::schema::{logical_schema_from_lance, LanceSchema}; use crate::session::Session; +use crate::storage_options::PyStorageOptionsAccessor; use crate::utils::PyLance; use crate::{LanceReader, Scanner}; @@ -1518,6 +1519,39 @@ impl Dataset { .map_err(|err| PyIOError::new_err(err.to_string())) } + /// Get the initial storage options used to open this dataset. + /// + /// This returns the options that were provided when the dataset was opened, + /// without any refresh from the provider. Returns None if no storage options + /// were provided. + fn storage_options(&self) -> Option> { + self.ds.storage_options().cloned() + } + + /// Get the latest storage options, potentially refreshed from the provider. + /// + /// If a storage options provider was configured and credentials are expiring, + /// this will refresh them. Returns the current valid storage options, or None + /// if no storage options accessor is configured. + fn latest_storage_options( + self_: PyRef<'_, Self>, + ) -> PyResult>> { + let result = rt() + .block_on(Some(self_.py()), self_.ds.latest_storage_options())? + .map_err(|err| PyIOError::new_err(err.to_string()))?; + Ok(result.map(|opts| opts.0)) + } + + /// Get the storage options accessor for this dataset. + /// + /// The accessor bundles static storage options and optional dynamic provider, + /// handling caching and refresh logic internally. + fn storage_options_accessor(&self) -> Option { + self.ds + .storage_options_accessor() + .map(PyStorageOptionsAccessor::new) + } + fn checkout_version(&self, version: Bound) -> PyResult { let reference = self.transform_ref(Some(version))?; self._checkout_version(reference) diff --git a/python/src/lib.rs b/python/src/lib.rs index 6448ff9afa3..b7f449289bb 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -281,6 +281,7 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; #[cfg(feature = "rest-adapter")] m.add_class::()?; + m.add_class::()?; m.add_wrapped(wrap_pyfunction!(bfloat16_array))?; m.add_wrapped(wrap_pyfunction!(write_dataset))?; m.add_wrapped(wrap_pyfunction!(write_fragments))?; diff --git a/python/src/storage_options.rs b/python/src/storage_options.rs index 7501fc4a090..56486d997be 100644 --- a/python/src/storage_options.rs +++ b/python/src/storage_options.rs @@ -3,9 +3,10 @@ use std::collections::HashMap; use std::sync::Arc; +use std::time::Duration; use async_trait::async_trait; -use lance_io::object_store::StorageOptionsProvider; +use lance_io::object_store::{StorageOptionsAccessor, StorageOptionsProvider}; use pyo3::prelude::*; use pyo3::types::PyDict; @@ -167,3 +168,132 @@ pub fn py_object_to_storage_options_provider( let py_provider = PyStorageOptionsProvider::new(py_obj)?; Ok(Arc::new(PyStorageOptionsProviderWrapper::new(py_provider))) } + +/// Python wrapper for StorageOptionsAccessor +/// +/// This wraps a Rust StorageOptionsAccessor and exposes it to Python. +#[pyclass(name = "StorageOptionsAccessor")] +#[derive(Clone)] +pub struct PyStorageOptionsAccessor { + inner: Arc, +} + +impl PyStorageOptionsAccessor { + pub fn new(accessor: Arc) -> Self { + Self { inner: accessor } + } + + pub fn inner(&self) -> Arc { + self.inner.clone() + } +} + +#[pymethods] +impl PyStorageOptionsAccessor { + /// Create an accessor with only static options (no refresh capability) + #[staticmethod] + fn static_options(options: HashMap) -> Self { + Self { + inner: Arc::new(StorageOptionsAccessor::static_options(options)), + } + } + + /// Create an accessor with a dynamic provider (no initial options) + #[staticmethod] + #[pyo3(signature = (provider, refresh_offset_secs=300))] + fn with_provider(provider: &Bound<'_, PyAny>, refresh_offset_secs: u64) -> PyResult { + let rust_provider = py_object_to_storage_options_provider(provider)?; + Ok(Self { + inner: Arc::new(StorageOptionsAccessor::with_provider( + rust_provider, + Duration::from_secs(refresh_offset_secs), + )), + }) + } + + /// Create an accessor with initial options and a dynamic provider + #[staticmethod] + #[pyo3(signature = (initial_options, provider, refresh_offset_secs=300))] + fn with_initial_and_provider( + initial_options: HashMap, + provider: &Bound<'_, PyAny>, + refresh_offset_secs: u64, + ) -> PyResult { + let rust_provider = py_object_to_storage_options_provider(provider)?; + Ok(Self { + inner: Arc::new(StorageOptionsAccessor::with_initial_and_provider( + initial_options, + rust_provider, + Duration::from_secs(refresh_offset_secs), + )), + }) + } + + /// Get current valid storage options + fn get_storage_options(&self, py: Python<'_>) -> PyResult> { + let accessor = self.inner.clone(); + let options = rt() + .block_on(Some(py), accessor.get_storage_options())? + .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e.to_string()))?; + Ok(options.0) + } + + /// Get the initial storage options without refresh + fn initial_storage_options(&self) -> Option> { + self.inner.initial_storage_options().cloned() + } + + /// Get the accessor ID for equality/hashing + fn accessor_id(&self) -> String { + self.inner.accessor_id() + } + + /// Check if this accessor has a dynamic provider + fn has_provider(&self) -> bool { + self.inner.has_provider() + } + + /// Get the refresh offset in seconds + fn refresh_offset_secs(&self) -> u64 { + self.inner.refresh_offset().as_secs() + } + + fn __repr__(&self) -> String { + format!( + "StorageOptionsAccessor(id={}, has_provider={})", + self.inner.accessor_id(), + self.inner.has_provider() + ) + } +} + +/// Create a StorageOptionsAccessor from Python parameters +/// +/// This handles the conversion from Python types to Rust StorageOptionsAccessor +pub fn create_accessor_from_python( + storage_options: Option>, + storage_options_provider: Option<&Bound<'_, PyAny>>, + refresh_offset_secs: u64, +) -> PyResult>> { + match (storage_options, storage_options_provider) { + (Some(opts), Some(provider)) => { + let rust_provider = py_object_to_storage_options_provider(provider)?; + Ok(Some(Arc::new(StorageOptionsAccessor::with_initial_and_provider( + opts, + rust_provider, + Duration::from_secs(refresh_offset_secs), + )))) + } + (None, Some(provider)) => { + let rust_provider = py_object_to_storage_options_provider(provider)?; + Ok(Some(Arc::new(StorageOptionsAccessor::with_provider( + rust_provider, + Duration::from_secs(refresh_offset_secs), + )))) + } + (Some(opts), None) => { + Ok(Some(Arc::new(StorageOptionsAccessor::static_options(opts)))) + } + (None, None) => Ok(None), + } +} diff --git a/rust/lance-io/src/object_store.rs b/rust/lance-io/src/object_store.rs index d9d700bf878..b53a665f545 100644 --- a/rust/lance-io/src/object_store.rs +++ b/rust/lance-io/src/object_store.rs @@ -35,6 +35,7 @@ use super::local::LocalObjectReader; mod list_retry; pub mod providers; pub mod storage_options; +pub mod storage_options_accessor; mod tracing; use crate::object_reader::SmallReader; use crate::object_writer::WriteResult; @@ -66,6 +67,7 @@ pub use providers::{ObjectStoreProvider, ObjectStoreRegistry}; pub use storage_options::{ LanceNamespaceStorageOptionsProvider, StorageOptionsProvider, EXPIRES_AT_MILLIS_KEY, }; +pub use storage_options_accessor::StorageOptionsAccessor; #[async_trait] pub trait ObjectStoreExt { @@ -193,7 +195,16 @@ pub struct ObjectStoreParams { pub object_store_wrapper: Option>, pub storage_options: Option>, /// Dynamic storage options provider for automatic credential refresh + #[deprecated( + since = "0.25.0", + note = "Use storage_options_accessor instead for unified access to storage options" + )] pub storage_options_provider: Option>, + /// Unified storage options accessor with caching and automatic refresh + /// + /// This is the preferred way to provide storage options. It bundles static options + /// with an optional dynamic provider and handles all caching and refresh logic. + pub storage_options_accessor: Option>, /// Use constant size upload parts for multipart uploads. Only necessary /// for Cloudflare R2, which doesn't support variable size parts. When this /// is false, max upload size is 2.5TB. When this is true, the max size is @@ -214,12 +225,46 @@ impl Default for ObjectStoreParams { object_store_wrapper: None, storage_options: None, storage_options_provider: None, + storage_options_accessor: None, use_constant_size_upload_parts: false, list_is_lexically_ordered: None, } } } +impl ObjectStoreParams { + /// Get or create a StorageOptionsAccessor from the params + /// + /// If `storage_options_accessor` is already set, returns it. + /// Otherwise, creates an accessor from the legacy `storage_options` and + /// `storage_options_provider` fields. + #[allow(deprecated)] + pub fn get_or_create_accessor(&self) -> Option> { + if let Some(accessor) = &self.storage_options_accessor { + return Some(accessor.clone()); + } + + // Create accessor from legacy fields + match (&self.storage_options, &self.storage_options_provider) { + (Some(opts), Some(provider)) => { + Some(Arc::new(StorageOptionsAccessor::with_initial_and_provider( + opts.clone(), + provider.clone(), + self.s3_credentials_refresh_offset, + ))) + } + (None, Some(provider)) => Some(Arc::new(StorageOptionsAccessor::with_provider( + provider.clone(), + self.s3_credentials_refresh_offset, + ))), + (Some(opts), None) => Some(Arc::new(StorageOptionsAccessor::static_options( + opts.clone(), + ))), + (None, None) => None, + } + } +} + // We implement hash for caching impl std::hash::Hash for ObjectStoreParams { #[allow(deprecated)] @@ -247,6 +292,9 @@ impl std::hash::Hash for ObjectStoreParams { if let Some(provider) = &self.storage_options_provider { provider.provider_id().hash(state); } + if let Some(accessor) = &self.storage_options_accessor { + accessor.accessor_id().hash(state); + } self.use_constant_size_upload_parts.hash(state); self.list_is_lexically_ordered.hash(state); } @@ -263,7 +311,7 @@ impl PartialEq for ObjectStoreParams { } // For equality, we use pointer comparison for ObjectStore, S3 credentials, wrapper - // For storage_options_provider, we use provider_id() for semantic equality + // For storage_options_provider and accessor, we use provider_id()/accessor_id() for semantic equality self.block_size == other.block_size && self .object_store @@ -285,6 +333,14 @@ impl PartialEq for ObjectStoreParams { .storage_options_provider .as_ref() .map(|p| p.provider_id()) + && self + .storage_options_accessor + .as_ref() + .map(|a| a.accessor_id()) + == other + .storage_options_accessor + .as_ref() + .map(|a| a.accessor_id()) && self.use_constant_size_upload_parts == other.use_constant_size_upload_parts && self.list_is_lexically_ordered == other.list_is_lexically_ordered } diff --git a/rust/lance-io/src/object_store/providers/aws.rs b/rust/lance-io/src/object_store/providers/aws.rs index 5def1b52923..ad1c0bcefd5 100644 --- a/rust/lance-io/src/object_store/providers/aws.rs +++ b/rust/lance-io/src/object_store/providers/aws.rs @@ -28,8 +28,9 @@ use tokio::sync::RwLock; use url::Url; use crate::object_store::{ - ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptions, StorageOptionsProvider, - DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, + ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptions, StorageOptionsAccessor, + StorageOptionsProvider, DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, + DEFAULT_MAX_IOP_SIZE, }; use lance_core::error::{Error, Result}; @@ -54,13 +55,16 @@ impl AwsStoreProvider { let mut s3_storage_options = storage_options.as_s3_options(); let region = resolve_s3_region(base_path, &s3_storage_options).await?; + + // Get or create accessor from params + let accessor = params.get_or_create_accessor(); + let (aws_creds, region) = build_aws_credential( params.s3_credentials_refresh_offset, params.aws_credentials.clone(), Some(&s3_storage_options), region, - params.storage_options_provider.clone(), - storage_options.expires_at_millis(), + accessor, ) .await?; @@ -228,20 +232,17 @@ async fn resolve_s3_region( /// Build AWS credentials /// /// This resolves credentials from the following sources in order: -/// 1. An explicit `storage_options_provider` +/// 1. An explicit `storage_options_accessor` with a provider /// 2. An explicit `credentials` provider /// 3. Explicit credentials in storage_options (as in `aws_access_key_id`, /// `aws_secret_access_key`, `aws_session_token`) /// 4. The default credential provider chain from AWS SDK. /// -/// # Initial Credentials with Storage Options Provider +/// # Storage Options Accessor /// -/// When `storage_options_provider` is provided along with `storage_options` and -/// `expires_at_millis`, these serve as **initial values** to avoid redundant calls to -/// fetch new storage options. The provider will use these initial credentials until they -/// expire (based on `expires_at_millis`), then automatically fetch fresh credentials from -/// the provider. Once the initial credentials expire, the passed-in values are no longer -/// used - all future credentials come from the provider's `fetch_storage_options()` method. +/// When `storage_options_accessor` is provided and has a dynamic provider, +/// credentials are fetched and cached by the accessor with automatic refresh +/// before expiration. /// /// `credentials_refresh_offset` is the amount of time before expiry to refresh credentials. pub async fn build_aws_credential( @@ -249,10 +250,8 @@ pub async fn build_aws_credential( credentials: Option, storage_options: Option<&HashMap>, region: Option, - storage_options_provider: Option>, - expires_at_millis: Option, + storage_options_accessor: Option>, ) -> Result<(AwsCredentialProvider, String)> { - // TODO: make this return no credential provider not using AWS use aws_config::meta::region::RegionProviderChain; const DEFAULT_REGION: &str = "us-west-2"; @@ -268,17 +267,24 @@ pub async fn build_aws_credential( }; let storage_options_credentials = storage_options.and_then(extract_static_s3_credentials); - if let Some(storage_options_provider) = storage_options_provider { - let creds = build_aws_credential_with_storage_options_provider( - storage_options_provider, - credentials_refresh_offset, - credentials, - storage_options_credentials, - expires_at_millis, - ) - .await?; - Ok((creds, region)) - } else if let Some(creds) = credentials { + + // If accessor has a provider, use DynamicStorageOptionsCredentialProvider + if let Some(accessor) = storage_options_accessor { + if accessor.has_provider() { + // Explicit aws_credentials takes precedence + if let Some(creds) = credentials { + return Ok((creds, region)); + } + // Use accessor for dynamic credential refresh + return Ok(( + Arc::new(DynamicStorageOptionsCredentialProvider::new(accessor)), + region, + )); + } + } + + // Fall back to existing logic for static credentials + if let Some(creds) = credentials { Ok((creds, region)) } else if let Some(creds) = storage_options_credentials { Ok((Arc::new(creds), region)) @@ -295,37 +301,6 @@ pub async fn build_aws_credential( } } -async fn build_aws_credential_with_storage_options_provider( - storage_options_provider: Arc, - credentials_refresh_offset: Duration, - credentials: Option, - storage_options_credentials: Option>, - expires_at_millis: Option, -) -> Result { - match (credentials, storage_options_credentials) { - // Case 1: Explicit aws_credentials provider takes precedence - use it directly - // without wrapping in DynamicStorageOptionsCredentialProvider, as the user's - // provider should handle its own credential refresh logic. - (Some(cred), _) => Ok(cred), - // Case 2: storage_options credentials - wrap with DynamicStorageOptionsCredentialProvider - // to enable auto-refresh from the namespace - (None, Some(cred)) => Ok(Arc::new( - DynamicStorageOptionsCredentialProvider::new_with_initial_credential( - storage_options_provider, - credentials_refresh_offset, - cred.get_credential().await?, - expires_at_millis, - ), - )), - // Case 3: No initial credentials - DynamicStorageOptionsCredentialProvider will - // fetch credentials from the namespace on first use - (None, None) => Ok(Arc::new(DynamicStorageOptionsCredentialProvider::new( - storage_options_provider, - credentials_refresh_offset, - ))), - } -} - fn extract_static_s3_credentials( options: &HashMap, ) -> Option> { @@ -477,11 +452,11 @@ impl ObjectStoreParams { } } -/// AWS Credential Provider that uses StorageOptionsProvider +/// AWS Credential Provider that delegates to StorageOptionsAccessor /// -/// This adapter converts our generic StorageOptionsProvider trait into -/// AWS-specific credentials that can be used with S3. It caches credentials -/// and automatically refreshes them before they expire. +/// This adapter converts storage options from a [`StorageOptionsAccessor`] into +/// AWS-specific credentials that can be used with S3. All caching and refresh logic +/// is handled by the accessor. /// /// # Future Work /// @@ -491,129 +466,77 @@ impl ObjectStoreParams { /// /// See: pub struct DynamicStorageOptionsCredentialProvider { - provider: Arc, - cache: Arc>>, - refresh_offset: Duration, + accessor: Arc, } impl fmt::Debug for DynamicStorageOptionsCredentialProvider { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("DynamicStorageOptionsCredentialProvider") - .field("provider", &self.provider) - .field("refresh_offset", &self.refresh_offset) + .field("accessor", &self.accessor) .finish() } } -#[derive(Debug, Clone)] -struct CachedCredential { - credential: Arc, - expires_at_millis: Option, -} - impl DynamicStorageOptionsCredentialProvider { - /// Create a new credential provider without initial credentials + /// Create a new credential provider from a storage options accessor + pub fn new(accessor: Arc) -> Self { + Self { accessor } + } + + /// Create a new credential provider from a storage options provider + /// + /// This is a convenience constructor for backward compatibility. /// /// # Arguments /// * `provider` - The storage options provider /// * `refresh_offset` - Duration before expiry to refresh credentials - pub fn new(provider: Arc, refresh_offset: Duration) -> Self { + pub fn from_provider( + provider: Arc, + refresh_offset: Duration, + ) -> Self { Self { - provider, - cache: Arc::new(RwLock::new(None)), - refresh_offset, + accessor: Arc::new(StorageOptionsAccessor::with_provider( + provider, + refresh_offset, + )), } } - /// Create a new credential provider with initial credentials from an explicit credential + /// Create a new credential provider with initial credentials + /// + /// This is a convenience constructor for backward compatibility. /// /// # Arguments /// * `provider` - The storage options provider /// * `refresh_offset` - Duration before expiry to refresh credentials - /// * `credential` - Initial credential to cache - /// * `expires_at_millis` - Expiration time in milliseconds since epoch. If None, credentials - /// are treated as non-expiring and will not be automatically refreshed. - pub fn new_with_initial_credential( + /// * `initial_options` - Initial storage options to cache + pub fn from_provider_with_initial( provider: Arc, refresh_offset: Duration, - credential: Arc, - expires_at_millis: Option, + initial_options: HashMap, ) -> Self { Self { - provider, - cache: Arc::new(RwLock::new(Some(CachedCredential { - credential, - expires_at_millis, - }))), - refresh_offset, - } - } - - fn needs_refresh(&self, cached: &Option) -> bool { - match cached { - None => true, - Some(cached_cred) => { - if let Some(expires_at_millis) = cached_cred.expires_at_millis { - let now_ms = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap_or(Duration::from_secs(0)) - .as_millis() as u64; - - // Refresh if we're within the refresh offset of expiration - let refresh_offset_millis = self.refresh_offset.as_millis() as u64; - now_ms + refresh_offset_millis >= expires_at_millis - } else { - // No expiration means credentials never expire - false - } - } + accessor: Arc::new(StorageOptionsAccessor::with_initial_and_provider( + initial_options, + provider, + refresh_offset, + )), } } +} - async fn do_get_credential(&self) -> ObjectStoreResult>> { - // Check if we have valid cached credentials with read lock - { - let cached = self.cache.read().await; - if !self.needs_refresh(&cached) { - if let Some(cached_cred) = &*cached { - return Ok(Some(cached_cred.credential.clone())); - } - } - } - - // Try to acquire write lock - if it fails, return None and let caller retry - let Ok(mut cache) = self.cache.try_write() else { - return Ok(None); - }; - - // Double-check if credentials are still stale after acquiring write lock - // (another thread might have refreshed them) - if !self.needs_refresh(&cache) { - if let Some(cached_cred) = &*cache { - return Ok(Some(cached_cred.credential.clone())); - } - } - - log::debug!( - "Refreshing S3 credentials from storage options provider: {}", - self.provider.provider_id() - ); +#[async_trait::async_trait] +impl CredentialProvider for DynamicStorageOptionsCredentialProvider { + type Credential = ObjectStoreAwsCredential; - let storage_options_map = self - .provider - .fetch_storage_options() - .await - .map_err(|e| object_store::Error::Generic { + async fn get_credential(&self) -> ObjectStoreResult> { + let storage_options = self.accessor.get_storage_options().await.map_err(|e| { + object_store::Error::Generic { store: "DynamicStorageOptionsCredentialProvider", source: Box::new(e), - })? - .ok_or_else(|| object_store::Error::Generic { - store: "DynamicStorageOptionsCredentialProvider", - source: "No storage options available".into(), - })?; + } + })?; - let storage_options = StorageOptions(storage_options_map); - let expires_at_millis = storage_options.expires_at_millis(); let s3_options = storage_options.as_s3_options(); let static_creds = extract_static_s3_credentials(&s3_options).ok_or_else(|| { object_store::Error::Generic { @@ -622,58 +545,13 @@ impl DynamicStorageOptionsCredentialProvider { } })?; - let credential = - static_creds - .get_credential() - .await - .map_err(|e| object_store::Error::Generic { - store: "DynamicStorageOptionsCredentialProvider", - source: Box::new(e), - })?; - - if let Some(expires_at) = expires_at_millis { - let now_ms = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap_or(Duration::from_secs(0)) - .as_millis() as u64; - let expires_in_secs = (expires_at.saturating_sub(now_ms)) / 1000; - log::debug!( - "Successfully refreshed S3 credentials from provider: {}, credentials expire in {} seconds", - self.provider.provider_id(), - expires_in_secs - ); - } else { - log::debug!( - "Successfully refreshed S3 credentials from provider: {} (no expiration)", - self.provider.provider_id() - ); - } - - *cache = Some(CachedCredential { - credential: credential.clone(), - expires_at_millis, - }); - - Ok(Some(credential)) - } -} - -#[async_trait::async_trait] -impl CredentialProvider for DynamicStorageOptionsCredentialProvider { - type Credential = ObjectStoreAwsCredential; - - async fn get_credential(&self) -> ObjectStoreResult> { - // Retry loop - if do_get_credential returns None (lock busy), retry from the beginning - loop { - match self.do_get_credential().await? { - Some(cred) => return Ok(cred), - None => { - // Lock was busy, wait 10ms before retrying - tokio::time::sleep(Duration::from_millis(10)).await; - continue; - } - } - } + static_creds + .get_credential() + .await + .map_err(|e| object_store::Error::Generic { + store: "DynamicStorageOptionsCredentialProvider", + source: Box::new(e), + }) } } @@ -878,19 +756,22 @@ mod tests { 600_000, // Expires in 10 minutes ))); - // Create credential provider with initial cached credentials that expire in 10 minutes + // Create initial options with cached credentials that expire in 10 minutes let expires_at = now_ms + 600_000; // 10 minutes from now - let initial_cred = Arc::new(ObjectStoreAwsCredential { - key_id: "AKID_CACHED".to_string(), - secret_key: "SECRET_CACHED".to_string(), - token: Some("TOKEN_CACHED".to_string()), - }); + let initial_options = HashMap::from([ + ("aws_access_key_id".to_string(), "AKID_CACHED".to_string()), + ( + "aws_secret_access_key".to_string(), + "SECRET_CACHED".to_string(), + ), + ("aws_session_token".to_string(), "TOKEN_CACHED".to_string()), + ("expires_at_millis".to_string(), expires_at.to_string()), + ]); - let provider = DynamicStorageOptionsCredentialProvider::new_with_initial_credential( + let provider = DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), Duration::from_secs(300), // 5 minute refresh offset - initial_cred, - Some(expires_at), + initial_options, ); // First call should use cached credentials (not expired yet) @@ -914,19 +795,21 @@ mod tests { 600_000, // Expires in 10 minutes ))); - // Create credential provider with initial cached credentials that expired 1 second ago + // Create initial options with credentials that expired 1 second ago let expired_time = now_ms - 1_000; // 1 second ago - let initial_cred = Arc::new(ObjectStoreAwsCredential { - key_id: "AKID_EXPIRED".to_string(), - secret_key: "SECRET_EXPIRED".to_string(), - token: None, - }); + let initial_options = HashMap::from([ + ("aws_access_key_id".to_string(), "AKID_EXPIRED".to_string()), + ( + "aws_secret_access_key".to_string(), + "SECRET_EXPIRED".to_string(), + ), + ("expires_at_millis".to_string(), expired_time.to_string()), + ]); - let provider = DynamicStorageOptionsCredentialProvider::new_with_initial_credential( + let provider = DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), Duration::from_secs(300), // 5 minute refresh offset - initial_cred, - Some(expired_time), + initial_options, ); // First call should fetch new credentials because cached ones are expired @@ -950,7 +833,7 @@ mod tests { // Create credential provider with 5 minute refresh offset // This means credentials should be refreshed when they have less than 5 minutes left - let provider = DynamicStorageOptionsCredentialProvider::new( + let provider = DynamicStorageOptionsCredentialProvider::from_provider( mock.clone(), Duration::from_secs(300), // 5 minute refresh offset ); @@ -980,7 +863,7 @@ mod tests { ))); // Create credential provider without initial cache - let provider = DynamicStorageOptionsCredentialProvider::new( + let provider = DynamicStorageOptionsCredentialProvider::from_provider( mock.clone(), Duration::from_secs(300), // 5 minute refresh offset ); @@ -1014,7 +897,7 @@ mod tests { } #[tokio::test] - async fn test_dynamic_credential_provider_with_initial_credential() { + async fn test_dynamic_credential_provider_with_initial_options() { MockClock::set_system_time(Duration::from_secs(100_000)); let now_ms = MockClock::system_time().as_millis() as u64; @@ -1024,20 +907,23 @@ mod tests { 600_000, // Expires in 10 minutes ))); - // Create an initial credential with expiration in 10 minutes + // Create initial options with expiration in 10 minutes let expires_at = now_ms + 600_000; // 10 minutes from now - let initial_cred = Arc::new(ObjectStoreAwsCredential { - key_id: "AKID_INITIAL".to_string(), - secret_key: "SECRET_INITIAL".to_string(), - token: Some("TOKEN_INITIAL".to_string()), - }); - - // Create credential provider with initial credential and expiration - let provider = DynamicStorageOptionsCredentialProvider::new_with_initial_credential( + let initial_options = HashMap::from([ + ("aws_access_key_id".to_string(), "AKID_INITIAL".to_string()), + ( + "aws_secret_access_key".to_string(), + "SECRET_INITIAL".to_string(), + ), + ("aws_session_token".to_string(), "TOKEN_INITIAL".to_string()), + ("expires_at_millis".to_string(), expires_at.to_string()), + ]); + + // Create credential provider with initial options + let provider = DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), Duration::from_secs(300), // 5 minute refresh offset - initial_cred, - Some(expires_at), + initial_options, ); // First call should use the initial credential (not expired yet) @@ -1086,7 +972,7 @@ mod tests { // Create a mock provider with far future expiration let mock = Arc::new(MockStorageOptionsProvider::new(Some(9999999999999))); - let provider = Arc::new(DynamicStorageOptionsCredentialProvider::new( + let provider = Arc::new(DynamicStorageOptionsCredentialProvider::from_provider( mock.clone(), Duration::from_secs(300), )); @@ -1134,14 +1020,17 @@ mod tests { let now_ms = MockClock::system_time().as_millis() as u64; - // Create initial credentials that expired in the past (1000 seconds ago) + // Create initial options with credentials that expired in the past (1000 seconds ago) let expires_at = now_ms - 1_000_000; - - let initial_cred = Arc::new(ObjectStoreAwsCredential { - key_id: "AKID_OLD".to_string(), - secret_key: "SECRET_OLD".to_string(), - token: Some("TOKEN_OLD".to_string()), - }); + let initial_options = HashMap::from([ + ("aws_access_key_id".to_string(), "AKID_OLD".to_string()), + ( + "aws_secret_access_key".to_string(), + "SECRET_OLD".to_string(), + ), + ("aws_session_token".to_string(), "TOKEN_OLD".to_string()), + ("expires_at_millis".to_string(), expires_at.to_string()), + ]); // Mock will return credentials expiring in 1 hour let mock = Arc::new(MockStorageOptionsProvider::new(Some( @@ -1149,11 +1038,10 @@ mod tests { ))); let provider = Arc::new( - DynamicStorageOptionsCredentialProvider::new_with_initial_credential( + DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), Duration::from_secs(300), - initial_cred, - Some(expires_at), + initial_options, ), ); @@ -1201,21 +1089,27 @@ mod tests { } #[tokio::test] - async fn test_explicit_aws_credentials_takes_precedence_over_storage_options_provider() { + async fn test_explicit_aws_credentials_takes_precedence_over_accessor() { // Create a mock storage options provider that should NOT be called let mock_storage_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + // Create an accessor with the mock provider + let accessor = Arc::new(StorageOptionsAccessor::with_provider( + mock_storage_provider.clone(), + Duration::from_secs(300), + )); + // Create an explicit AWS credentials provider let explicit_cred_provider = Arc::new(MockAwsCredentialsProvider::default()); - // Build credentials with both aws_credentials AND storage_options_provider + // Build credentials with both aws_credentials AND accessor // The explicit aws_credentials should take precedence - let result = build_aws_credential_with_storage_options_provider( - mock_storage_provider.clone(), + let (result, _region) = build_aws_credential( Duration::from_secs(300), Some(explicit_cred_provider.clone() as AwsCredentialProvider), - None, // no storage_options credentials - Some(1000), + None, // no storage_options + Some("us-west-2".to_string()), + Some(accessor), ) .await .unwrap(); @@ -1239,8 +1133,7 @@ mod tests { } #[tokio::test] - async fn test_storage_options_credentials_uses_dynamic_provider_when_no_explicit_aws_credentials( - ) { + async fn test_accessor_used_when_no_explicit_aws_credentials() { MockClock::set_system_time(Duration::from_secs(100_000)); let now_ms = MockClock::system_time().as_millis() as u64; @@ -1248,30 +1141,46 @@ mod tests { // Create a mock storage options provider let mock_storage_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); - // Create storage_options credentials (simulating credentials from storage_options map) - let storage_options_cred = StaticCredentialProvider::new(ObjectStoreAwsCredential { - key_id: "AKID_FROM_STORAGE_OPTIONS".to_string(), - secret_key: "SECRET_FROM_STORAGE_OPTIONS".to_string(), - token: Some("TOKEN_FROM_STORAGE_OPTIONS".to_string()), - }); - + // Create initial options let expires_at = now_ms + 600_000; // 10 minutes from now + let initial_options = HashMap::from([ + ( + "aws_access_key_id".to_string(), + "AKID_FROM_ACCESSOR".to_string(), + ), + ( + "aws_secret_access_key".to_string(), + "SECRET_FROM_ACCESSOR".to_string(), + ), + ( + "aws_session_token".to_string(), + "TOKEN_FROM_ACCESSOR".to_string(), + ), + ("expires_at_millis".to_string(), expires_at.to_string()), + ]); - // Build credentials with storage_options_credentials but NO explicit aws_credentials - let result = build_aws_credential_with_storage_options_provider( + // Create an accessor with initial options and provider + let accessor = Arc::new(StorageOptionsAccessor::with_initial_and_provider( + initial_options, mock_storage_provider.clone(), Duration::from_secs(300), + )); + + // Build credentials with accessor but NO explicit aws_credentials + let (result, _region) = build_aws_credential( + Duration::from_secs(300), None, // no explicit aws_credentials - Some(storage_options_cred), - Some(expires_at), + None, // no storage_options + Some("us-west-2".to_string()), + Some(accessor), ) .await .unwrap(); - // Get credential - should use the initial storage_options credentials + // Get credential - should use the initial accessor credentials let cred = result.get_credential().await.unwrap(); - assert_eq!(cred.key_id, "AKID_FROM_STORAGE_OPTIONS"); - assert_eq!(cred.secret_key, "SECRET_FROM_STORAGE_OPTIONS"); + assert_eq!(cred.key_id, "AKID_FROM_ACCESSOR"); + assert_eq!(cred.secret_key, "SECRET_FROM_ACCESSOR"); // Storage options provider should NOT have been called yet (using cached initial creds) assert_eq!(mock_storage_provider.get_call_count().await, 0); diff --git a/rust/lance-io/src/object_store/storage_options_accessor.rs b/rust/lance-io/src/object_store/storage_options_accessor.rs new file mode 100644 index 00000000000..0a4748c389a --- /dev/null +++ b/rust/lance-io/src/object_store/storage_options_accessor.rs @@ -0,0 +1,584 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Unified storage options accessor with caching and automatic refresh +//! +//! This module provides [`StorageOptionsAccessor`] which bundles static storage options +//! with an optional dynamic provider, handling all caching and refresh logic internally. + +use std::collections::HashMap; +use std::fmt; +use std::sync::Arc; +use std::time::Duration; + +#[cfg(test)] +use mock_instant::thread_local::{SystemTime, UNIX_EPOCH}; + +#[cfg(not(test))] +use std::time::{SystemTime, UNIX_EPOCH}; + +use tokio::sync::RwLock; + +use super::storage_options::StorageOptionsProvider; +use super::{StorageOptions, EXPIRES_AT_MILLIS_KEY}; +use lance_core::{Error, Result}; +use snafu::location; + +/// Unified access to storage options with automatic caching and refresh +/// +/// This struct bundles static storage options with an optional dynamic provider, +/// handling all caching and refresh logic internally. It provides a single entry point +/// for accessing storage options regardless of whether they're static or dynamic. +/// +/// # Behavior +/// +/// - If only static options are provided, returns those options +/// - If a provider is configured, fetches from provider and caches results +/// - Automatically refreshes cached options before expiration (based on refresh_offset) +/// - Uses `expires_at_millis` key to track expiration +/// +/// # Thread Safety +/// +/// The accessor is thread-safe and can be shared across multiple tasks. +/// Concurrent refresh attempts are deduplicated using a try-lock mechanism. +pub struct StorageOptionsAccessor { + /// Initial/fallback static storage options + initial_options: Option>, + + /// Optional dynamic provider for refreshing options + provider: Option>, + + /// Cached storage options with expiration tracking + cache: Arc>>, + + /// Duration before expiry to trigger refresh + refresh_offset: Duration, +} + +impl fmt::Debug for StorageOptionsAccessor { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("StorageOptionsAccessor") + .field("has_initial_options", &self.initial_options.is_some()) + .field("has_provider", &self.provider.is_some()) + .field("refresh_offset", &self.refresh_offset) + .finish() + } +} + +#[derive(Debug, Clone)] +struct CachedStorageOptions { + options: HashMap, + expires_at_millis: Option, +} + +impl StorageOptionsAccessor { + /// Create an accessor with only static options (no refresh capability) + /// + /// The returned accessor will always return the provided options. + /// This is useful when credentials don't expire or are managed externally. + pub fn static_options(options: HashMap) -> Self { + let expires_at_millis = options + .get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::().ok()); + + Self { + initial_options: Some(options.clone()), + provider: None, + cache: Arc::new(RwLock::new(Some(CachedStorageOptions { + options, + expires_at_millis, + }))), + refresh_offset: Duration::ZERO, + } + } + + /// Create an accessor with a dynamic provider (no initial options) + /// + /// The accessor will fetch from the provider on first access and cache + /// the results. Refresh happens automatically before expiration. + /// + /// # Arguments + /// * `provider` - The storage options provider for fetching fresh options + /// * `refresh_offset` - Duration before expiry to trigger refresh + pub fn with_provider( + provider: Arc, + refresh_offset: Duration, + ) -> Self { + Self { + initial_options: None, + provider: Some(provider), + cache: Arc::new(RwLock::new(None)), + refresh_offset, + } + } + + /// Create an accessor with initial options and a dynamic provider + /// + /// Initial options are used until they expire, then the provider is called. + /// This avoids an immediate fetch when initial credentials are still valid. + /// + /// # Arguments + /// * `initial_options` - Initial storage options to cache + /// * `provider` - The storage options provider for refreshing + /// * `refresh_offset` - Duration before expiry to trigger refresh + pub fn with_initial_and_provider( + initial_options: HashMap, + provider: Arc, + refresh_offset: Duration, + ) -> Self { + let expires_at_millis = initial_options + .get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::().ok()); + + Self { + initial_options: Some(initial_options.clone()), + provider: Some(provider), + cache: Arc::new(RwLock::new(Some(CachedStorageOptions { + options: initial_options, + expires_at_millis, + }))), + refresh_offset, + } + } + + /// Get current valid storage options + /// + /// - Returns cached options if not expired + /// - Fetches from provider if expired or not cached + /// - Falls back to initial_options if provider returns None + /// + /// # Errors + /// + /// Returns an error if: + /// - The provider fails to fetch options + /// - No options are available (no cache, no provider, no initial options) + pub async fn get_storage_options(&self) -> Result { + loop { + match self.do_get_storage_options().await? { + Some(options) => return Ok(options), + None => { + // Lock was busy, wait 10ms before retrying + tokio::time::sleep(Duration::from_millis(10)).await; + continue; + } + } + } + } + + async fn do_get_storage_options(&self) -> Result> { + // Check if we have valid cached options with read lock + { + let cached = self.cache.read().await; + if !self.needs_refresh(&cached) { + if let Some(cached_opts) = &*cached { + return Ok(Some(StorageOptions(cached_opts.options.clone()))); + } + } + } + + // If no provider, return initial options or error + let Some(provider) = &self.provider else { + return if let Some(initial) = &self.initial_options { + Ok(Some(StorageOptions(initial.clone()))) + } else { + Err(Error::IO { + source: Box::new(std::io::Error::other("No storage options available")), + location: location!(), + }) + }; + }; + + // Try to acquire write lock - if it fails, return None and let caller retry + let Ok(mut cache) = self.cache.try_write() else { + return Ok(None); + }; + + // Double-check if options are still stale after acquiring write lock + // (another thread might have refreshed them) + if !self.needs_refresh(&cache) { + if let Some(cached_opts) = &*cache { + return Ok(Some(StorageOptions(cached_opts.options.clone()))); + } + } + + log::debug!( + "Refreshing storage options from provider: {}", + provider.provider_id() + ); + + let storage_options_map = + provider + .fetch_storage_options() + .await + .map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to fetch storage options: {}", + e + ))), + location: location!(), + })?; + + let Some(options) = storage_options_map else { + // Provider returned None, fall back to initial options + if let Some(initial) = &self.initial_options { + return Ok(Some(StorageOptions(initial.clone()))); + } + return Err(Error::IO { + source: Box::new(std::io::Error::other( + "Provider returned no storage options", + )), + location: location!(), + }); + }; + + let expires_at_millis = options + .get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::().ok()); + + if let Some(expires_at) = expires_at_millis { + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or(Duration::from_secs(0)) + .as_millis() as u64; + let expires_in_secs = (expires_at.saturating_sub(now_ms)) / 1000; + log::debug!( + "Successfully refreshed storage options from provider: {}, options expire in {} seconds", + provider.provider_id(), + expires_in_secs + ); + } else { + log::debug!( + "Successfully refreshed storage options from provider: {} (no expiration)", + provider.provider_id() + ); + } + + *cache = Some(CachedStorageOptions { + options: options.clone(), + expires_at_millis, + }); + + Ok(Some(StorageOptions(options))) + } + + fn needs_refresh(&self, cached: &Option) -> bool { + match cached { + None => true, + Some(cached_opts) => { + if let Some(expires_at_millis) = cached_opts.expires_at_millis { + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or(Duration::from_secs(0)) + .as_millis() as u64; + + // Refresh if we're within the refresh offset of expiration + let refresh_offset_millis = self.refresh_offset.as_millis() as u64; + now_ms + refresh_offset_millis >= expires_at_millis + } else { + // No expiration means options never expire + false + } + } + } + } + + /// Get the initial storage options without refresh + /// + /// Returns the initial options that were provided when creating the accessor. + /// This does not trigger any refresh, even if the options have expired. + pub fn initial_storage_options(&self) -> Option<&HashMap> { + self.initial_options.as_ref() + } + + /// Get the accessor ID for equality/hashing + /// + /// Returns the provider_id if a provider exists, otherwise generates + /// a stable ID from the initial options hash. + pub fn accessor_id(&self) -> String { + if let Some(provider) = &self.provider { + provider.provider_id() + } else if let Some(initial) = &self.initial_options { + // Generate a stable ID from initial options + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let mut hasher = DefaultHasher::new(); + let mut keys: Vec<_> = initial.keys().collect(); + keys.sort(); + for key in keys { + key.hash(&mut hasher); + initial.get(key).hash(&mut hasher); + } + format!("static_options_{:x}", hasher.finish()) + } else { + "empty_accessor".to_string() + } + } + + /// Check if this accessor has a dynamic provider + pub fn has_provider(&self) -> bool { + self.provider.is_some() + } + + /// Get the refresh offset duration + pub fn refresh_offset(&self) -> Duration { + self.refresh_offset + } + + /// Get the storage options provider, if any + pub fn provider(&self) -> Option<&Arc> { + self.provider.as_ref() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use async_trait::async_trait; + use mock_instant::thread_local::MockClock; + + #[derive(Debug)] + struct MockStorageOptionsProvider { + call_count: Arc>, + expires_in_millis: Option, + } + + impl MockStorageOptionsProvider { + fn new(expires_in_millis: Option) -> Self { + Self { + call_count: Arc::new(RwLock::new(0)), + expires_in_millis, + } + } + + async fn get_call_count(&self) -> usize { + *self.call_count.read().await + } + } + + #[async_trait] + impl StorageOptionsProvider for MockStorageOptionsProvider { + async fn fetch_storage_options(&self) -> Result>> { + let count = { + let mut c = self.call_count.write().await; + *c += 1; + *c + }; + + let mut options = HashMap::from([ + ("aws_access_key_id".to_string(), format!("AKID_{}", count)), + ( + "aws_secret_access_key".to_string(), + format!("SECRET_{}", count), + ), + ("aws_session_token".to_string(), format!("TOKEN_{}", count)), + ]); + + if let Some(expires_in) = self.expires_in_millis { + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() as u64; + let expires_at = now_ms + expires_in; + options.insert(EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()); + } + + Ok(Some(options)) + } + + fn provider_id(&self) -> String { + let ptr = Arc::as_ptr(&self.call_count) as usize; + format!("MockStorageOptionsProvider {{ id: {} }}", ptr) + } + } + + #[tokio::test] + async fn test_static_options_only() { + let options = HashMap::from([ + ("key1".to_string(), "value1".to_string()), + ("key2".to_string(), "value2".to_string()), + ]); + let accessor = StorageOptionsAccessor::static_options(options.clone()); + + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0, options); + assert!(!accessor.has_provider()); + assert_eq!(accessor.initial_storage_options(), Some(&options)); + } + + #[tokio::test] + async fn test_provider_only() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + let accessor = + StorageOptionsAccessor::with_provider(mock_provider.clone(), Duration::from_secs(60)); + + let result = accessor.get_storage_options().await.unwrap(); + assert!(result.0.contains_key("aws_access_key_id")); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + assert!(accessor.has_provider()); + assert_eq!(accessor.initial_storage_options(), None); + assert_eq!(mock_provider.get_call_count().await, 1); + } + + #[tokio::test] + async fn test_initial_and_provider_uses_initial_first() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let now_ms = MockClock::system_time().as_millis() as u64; + let expires_at = now_ms + 600_000; // 10 minutes from now + + let initial = HashMap::from([ + ("aws_access_key_id".to_string(), "INITIAL_KEY".to_string()), + ( + "aws_secret_access_key".to_string(), + "INITIAL_SECRET".to_string(), + ), + (EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()), + ]); + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + let accessor = StorageOptionsAccessor::with_initial_and_provider( + initial.clone(), + mock_provider.clone(), + Duration::from_secs(60), + ); + + // First call uses initial + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "INITIAL_KEY"); + assert_eq!(mock_provider.get_call_count().await, 0); // Provider not called yet + } + + #[tokio::test] + async fn test_caching_and_refresh() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); // 10 min expiry + let accessor = StorageOptionsAccessor::with_provider( + mock_provider.clone(), + Duration::from_secs(300), // 5 min refresh offset + ); + + // First call fetches + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + assert_eq!(mock_provider.get_call_count().await, 1); + + // Second call uses cache + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + assert_eq!(mock_provider.get_call_count().await, 1); + + // Advance time to 6 minutes - should trigger refresh (within 5 min refresh offset) + MockClock::set_system_time(Duration::from_secs(100_000 + 360)); + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_2"); + assert_eq!(mock_provider.get_call_count().await, 2); + } + + #[tokio::test] + async fn test_expired_initial_triggers_refresh() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let now_ms = MockClock::system_time().as_millis() as u64; + let expired_time = now_ms - 1_000; // Expired 1 second ago + + let initial = HashMap::from([ + ("aws_access_key_id".to_string(), "EXPIRED_KEY".to_string()), + (EXPIRES_AT_MILLIS_KEY.to_string(), expired_time.to_string()), + ]); + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + let accessor = StorageOptionsAccessor::with_initial_and_provider( + initial, + mock_provider.clone(), + Duration::from_secs(60), + ); + + // Should fetch from provider since initial is expired + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + assert_eq!(mock_provider.get_call_count().await, 1); + } + + #[tokio::test] + async fn test_accessor_id_with_provider() { + let mock_provider = Arc::new(MockStorageOptionsProvider::new(None)); + let accessor = + StorageOptionsAccessor::with_provider(mock_provider, Duration::from_secs(60)); + + let id = accessor.accessor_id(); + assert!(id.starts_with("MockStorageOptionsProvider")); + } + + #[tokio::test] + async fn test_accessor_id_static() { + let options = HashMap::from([("key".to_string(), "value".to_string())]); + let accessor = StorageOptionsAccessor::static_options(options); + + let id = accessor.accessor_id(); + assert!(id.starts_with("static_options_")); + } + + #[tokio::test] + async fn test_concurrent_access() { + // Create a mock provider with far future expiration + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(9999999999999))); + + let accessor = Arc::new(StorageOptionsAccessor::with_provider( + mock_provider.clone(), + Duration::from_secs(300), + )); + + // Spawn 10 concurrent tasks that all try to get options at the same time + let mut handles = vec![]; + for i in 0..10 { + let acc = accessor.clone(); + let handle = tokio::spawn(async move { + let result = acc.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + i + }); + handles.push(handle); + } + + // Wait for all tasks to complete + let results: Vec<_> = futures::future::join_all(handles) + .await + .into_iter() + .map(|r| r.unwrap()) + .collect(); + + // Verify all 10 tasks completed successfully + assert_eq!(results.len(), 10); + + // The provider should have been called exactly once + let call_count = mock_provider.get_call_count().await; + assert_eq!( + call_count, 1, + "Provider should be called exactly once despite concurrent access" + ); + } + + #[tokio::test] + async fn test_no_expiration_never_refreshes() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let mock_provider = Arc::new(MockStorageOptionsProvider::new(None)); // No expiration + let accessor = + StorageOptionsAccessor::with_provider(mock_provider.clone(), Duration::from_secs(60)); + + // First call fetches + accessor.get_storage_options().await.unwrap(); + assert_eq!(mock_provider.get_call_count().await, 1); + + // Advance time significantly + MockClock::set_system_time(Duration::from_secs(200_000)); + + // Should still use cached options + accessor.get_storage_options().await.unwrap(); + assert_eq!(mock_provider.get_call_count().await, 1); + } +} diff --git a/rust/lance-table/src/io/commit.rs b/rust/lance-table/src/io/commit.rs index be650beac50..d93fa4b3c86 100644 --- a/rust/lance-table/src/io/commit.rs +++ b/rust/lance-table/src/io/commit.rs @@ -765,20 +765,22 @@ pub async fn commit_handler_from_url( } }; let options = options.clone().unwrap_or_default(); - let storage_options = StorageOptions(options.storage_options.unwrap_or_default()); - let dynamo_endpoint = get_dynamodb_endpoint(&storage_options); - let expires_at_millis = storage_options.expires_at_millis(); - let storage_options = storage_options.as_s3_options(); + let storage_options_raw = + StorageOptions(options.storage_options.clone().unwrap_or_default()); + let dynamo_endpoint = get_dynamodb_endpoint(&storage_options_raw); + let storage_options = storage_options_raw.as_s3_options(); let region = storage_options.get(&AmazonS3ConfigKey::Region).cloned(); + // Get or create accessor from the options + let accessor = options.get_or_create_accessor(); + let (aws_creds, region) = build_aws_credential( options.s3_credentials_refresh_offset, options.aws_credentials.clone(), Some(&storage_options), region, - options.storage_options_provider.clone(), - expires_at_millis, + accessor, ) .await?; diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index db291c546d0..c005f61639a 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -35,7 +35,8 @@ use lance_file::reader::FileReaderOptions; use lance_file::version::LanceFileVersion; use lance_index::DatasetIndexExt; use lance_io::object_store::{ - LanceNamespaceStorageOptionsProvider, ObjectStore, ObjectStoreParams, + LanceNamespaceStorageOptionsProvider, ObjectStore, ObjectStoreParams, StorageOptions, + StorageOptionsAccessor, }; use lance_io::utils::{read_last_block, read_message, read_metadata_offset, read_struct}; use lance_namespace::LanceNamespace; @@ -1612,7 +1613,10 @@ impl Dataset { &self.object_store } - /// Returns the storage options used when opening this dataset, if any. + /// Returns the initial storage options used when opening this dataset, if any. + /// + /// This returns the static initial options without triggering any refresh. + /// For the latest refreshed options, use [`Self::latest_storage_options`]. pub fn storage_options(&self) -> Option<&HashMap> { self.store_params .as_ref() @@ -1620,6 +1624,11 @@ impl Dataset { } /// Returns the storage options provider used when opening this dataset, if any. + #[deprecated( + since = "0.25.0", + note = "Use storage_options_accessor() instead for unified access to storage options" + )] + #[allow(deprecated)] pub fn storage_options_provider( &self, ) -> Option> { @@ -1628,6 +1637,41 @@ impl Dataset { .and_then(|params| params.storage_options_provider.clone()) } + /// Returns the unified storage options accessor for this dataset, if any. + /// + /// The accessor handles both static and dynamic storage options with automatic + /// caching and refresh. Use [`StorageOptionsAccessor::get_storage_options`] to + /// get the latest options. + pub fn storage_options_accessor(&self) -> Option> { + self.store_params + .as_ref() + .and_then(|params| params.get_or_create_accessor()) + } + + /// Returns the latest (possibly refreshed) storage options. + /// + /// If a dynamic storage options provider is configured, this will return + /// the cached options if still valid, or fetch fresh options if expired. + /// + /// For the initial static options without refresh, use [`Self::storage_options`]. + /// + /// # Returns + /// + /// - `Ok(Some(options))` - Storage options are available (static or refreshed) + /// - `Ok(None)` - No storage options were configured for this dataset + /// - `Err(...)` - Error occurred while fetching/refreshing options from provider + pub async fn latest_storage_options(&self) -> Result> { + // First check if we have an accessor (handles both static and dynamic options) + if let Some(accessor) = self.storage_options_accessor() { + let options = accessor.get_storage_options().await?; + return Ok(Some(options)); + } + + // Fallback to legacy storage_options field if no accessor + // This handles the case where storage_options was set without a provider + Ok(self.storage_options().cloned().map(StorageOptions)) + } + pub fn data_dir(&self) -> Path { self.base.child(DATA_DIR) } diff --git a/rust/lance/src/dataset/builder.rs b/rust/lance/src/dataset/builder.rs index 639fda28bca..89d290604a4 100644 --- a/rust/lance/src/dataset/builder.rs +++ b/rust/lance/src/dataset/builder.rs @@ -12,7 +12,7 @@ use lance_file::datatypes::populate_schema_dictionary; use lance_file::reader::FileReaderOptions; use lance_io::object_store::{ LanceNamespaceStorageOptionsProvider, ObjectStore, ObjectStoreParams, StorageOptions, - DEFAULT_CLOUD_IO_PARALLELISM, + StorageOptionsAccessor, DEFAULT_CLOUD_IO_PARALLELISM, }; use lance_namespace::models::DescribeTableRequest; use lance_namespace::LanceNamespace; @@ -351,6 +351,11 @@ impl DatasetBuilder { /// .with_s3_credentials_refresh_offset(Duration::from_secs(600)) /// .load() /// .await?; + #[deprecated( + since = "0.25.0", + note = "Use with_storage_options_accessor() instead for unified access to storage options" + )] + #[allow(deprecated)] pub fn with_storage_options_provider( mut self, provider: Arc, @@ -359,6 +364,36 @@ impl DatasetBuilder { self } + /// Set a unified storage options accessor for credential management + /// + /// The accessor bundles static storage options with an optional dynamic provider, + /// handling all caching and refresh logic internally. + /// + /// # Arguments + /// * `accessor` - The storage options accessor + /// + /// # Example + /// ```ignore + /// use std::sync::Arc; + /// use std::time::Duration; + /// use lance_io::object_store::StorageOptionsAccessor; + /// + /// // Create an accessor with a dynamic provider + /// let accessor = Arc::new(StorageOptionsAccessor::with_provider( + /// provider, + /// Duration::from_secs(300), // 5 minute refresh offset + /// )); + /// + /// let dataset = DatasetBuilder::from_uri("s3://bucket/table.lance") + /// .with_storage_options_accessor(accessor) + /// .load() + /// .await?; + /// ``` + pub fn with_storage_options_accessor(mut self, accessor: Arc) -> Self { + self.options.storage_options_accessor = Some(accessor); + self + } + /// Set options based on [ReadParams]. pub fn with_read_params(mut self, read_params: ReadParams) -> Self { self = self From 1de6c00b91bee6ab3b56d78dfbb0bd4e3b4f673f Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 15 Jan 2026 22:05:54 -0800 Subject: [PATCH 2/9] refactor: introduce storage options accessor --- java/lance-jni/src/blocking_dataset.rs | 35 +------ java/lance-jni/src/fragment.rs | 10 -- java/lance-jni/src/transaction.rs | 11 +-- java/lance-jni/src/utils.rs | 8 -- java/src/main/java/org/lance/Dataset.java | 11 +-- java/src/main/java/org/lance/Fragment.java | 13 +-- .../java/org/lance/OpenDatasetBuilder.java | 20 +--- java/src/main/java/org/lance/ReadOptions.java | 23 ----- java/src/main/java/org/lance/Transaction.java | 25 +---- .../java/org/lance/WriteDatasetBuilder.java | 20 ---- .../java/org/lance/WriteFragmentBuilder.java | 16 ---- java/src/main/java/org/lance/WriteParams.java | 14 --- .../org/lance/NamespaceIntegrationTest.java | 19 +--- python/python/lance/__init__.py | 26 +----- python/python/lance/dataset.py | 36 +------- python/python/lance/file.py | 15 --- python/python/lance/lance/__init__.pyi | 3 - .../tests/test_namespace_integration.py | 9 -- python/src/dataset.rs | 34 ++----- python/src/file.rs | 47 ++-------- python/src/storage_options.rs | 34 +++---- rust/lance-io/src/object_store.rs | 14 ++- .../src/object_store/providers/aws.rs | 18 ++-- .../src/object_store/storage_options.rs | 3 + .../object_store/storage_options_accessor.rs | 80 ++++++++-------- rust/lance-namespace-impls/src/dir.rs | 15 +-- rust/lance/src/dataset.rs | 91 +++++++++---------- rust/lance/src/dataset/builder.rs | 22 +---- 28 files changed, 160 insertions(+), 512 deletions(-) diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index 1c34b605071..28fe3f0ae5c 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -148,17 +148,12 @@ impl BlockingDataset { storage_options: HashMap, serialized_manifest: Option<&[u8]>, storage_options_provider: Option>, - s3_credentials_refresh_offset_seconds: Option, ) -> Result { let mut store_params = ObjectStoreParams { block_size: block_size.map(|size| size as usize), storage_options: Some(storage_options.clone()), ..Default::default() }; - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - store_params.s3_credentials_refresh_offset = - std::time::Duration::from_secs(offset_seconds); - } if let Some(provider) = storage_options_provider.clone() { store_params.storage_options_provider = Some(provider); } @@ -178,10 +173,6 @@ impl BlockingDataset { if let Some(provider) = storage_options_provider.clone() { builder = builder.with_storage_options_provider(provider) } - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - builder = builder - .with_s3_credentials_refresh_offset(std::time::Duration::from_secs(offset_seconds)); - } if let Some(serialized_manifest) = serialized_manifest { builder = builder.with_serialized_manifest(serialized_manifest)?; @@ -366,7 +357,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiSchema<'local>( data_storage_version: JObject, // Optional enable_v2_manifest_paths: JObject, // Optional storage_options_obj: JObject, // Map - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional initial_bases: JObject, target_bases: JObject, ) -> JObject<'local> { @@ -384,7 +374,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiSchema<'local>( data_storage_version, enable_v2_manifest_paths, storage_options_obj, - s3_credentials_refresh_offset_seconds_obj, initial_bases, target_bases, ) @@ -404,7 +393,6 @@ fn inner_create_with_ffi_schema<'local>( data_storage_version: JObject, // Optional enable_v2_manifest_paths: JObject, // Optional storage_options_obj: JObject, // Map - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional initial_bases: JObject, target_bases: JObject, ) -> Result> { @@ -425,7 +413,6 @@ fn inner_create_with_ffi_schema<'local>( enable_v2_manifest_paths, storage_options_obj, JObject::null(), // No provider for schema-only creation - s3_credentials_refresh_offset_seconds_obj, initial_bases, target_bases, reader, @@ -478,7 +465,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiStream<'local>( data_storage_version: JObject, // Optional enable_v2_manifest_paths: JObject, // Optional storage_options_obj: JObject, // Map - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional initial_bases: JObject, target_bases: JObject, ) -> JObject<'local> { @@ -497,7 +483,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiStream<'local>( data_storage_version, storage_options_obj, JObject::null(), - s3_credentials_refresh_offset_seconds_obj, initial_bases, target_bases ) @@ -519,7 +504,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiStreamAndProvider<'lo enable_v2_manifest_paths: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional initial_bases: JObject, // Optional> target_bases: JObject, // Optional> ) -> JObject<'local> { @@ -538,7 +522,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiStreamAndProvider<'lo enable_v2_manifest_paths, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj, initial_bases, target_bases ) @@ -559,7 +542,6 @@ fn inner_create_with_ffi_stream<'local>( enable_v2_manifest_paths: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional initial_bases: JObject, // Optional> target_bases: JObject, // Optional> ) -> Result> { @@ -577,7 +559,6 @@ fn inner_create_with_ffi_stream<'local>( enable_v2_manifest_paths, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj, initial_bases, target_bases, reader, @@ -597,7 +578,6 @@ fn create_dataset<'local>( enable_v2_manifest_paths: JObject, storage_options_obj: JObject, storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, initial_bases: JObject, target_bases: JObject, reader: impl RecordBatchReader + Send + 'static, @@ -615,7 +595,6 @@ fn create_dataset<'local>( Some(&enable_v2_manifest_paths), &storage_options_obj, &storage_options_provider_obj, - &s3_credentials_refresh_offset_seconds_obj, &initial_bases, &target_bases, )?; @@ -1092,7 +1071,6 @@ pub extern "system" fn Java_org_lance_Dataset_openNative<'local>( storage_options_obj: JObject, // Map serialized_manifest: JObject, // Optional storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw!( env, @@ -1106,7 +1084,6 @@ pub extern "system" fn Java_org_lance_Dataset_openNative<'local>( storage_options_obj, serialized_manifest, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj ) ) } @@ -1122,7 +1099,6 @@ fn inner_open_native<'local>( storage_options_obj: JObject, // Map serialized_manifest: JObject, // Optional storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let path_str: String = path.extract(env)?; let version = env.get_int_opt(&version_obj)?; @@ -1139,11 +1115,6 @@ fn inner_open_native<'local>( let storage_options_provider_arc = storage_options_provider.map(|v| Arc::new(v) as Arc); - // Extract s3_credentials_refresh_offset_seconds - let s3_credentials_refresh_offset_seconds = env - .get_long_opt(&s3_credentials_refresh_offset_seconds_obj)? - .map(|v| v as u64); - let serialized_manifest = env.get_bytes_opt(&serialized_manifest)?; let dataset = BlockingDataset::open( &path_str, @@ -1154,7 +1125,6 @@ fn inner_open_native<'local>( storage_options, serialized_manifest, storage_options_provider_arc, - s3_credentials_refresh_offset_seconds, )?; dataset.into_java(env) } @@ -1378,7 +1348,10 @@ pub extern "system" fn Java_org_lance_Dataset_nativeGetLatestStorageOptions<'loc mut env: JNIEnv<'local>, java_dataset: JObject, ) -> JObject<'local> { - ok_or_throw!(env, inner_get_latest_storage_options(&mut env, java_dataset)) + ok_or_throw!( + env, + inner_get_latest_storage_options(&mut env, java_dataset) + ) } fn inner_get_latest_storage_options<'local>( diff --git a/java/lance-jni/src/fragment.rs b/java/lance-jni/src/fragment.rs index 2ca3fe4824f..07cc0d53d73 100644 --- a/java/lance-jni/src/fragment.rs +++ b/java/lance-jni/src/fragment.rs @@ -91,7 +91,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiArray<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw_with_return!( env, @@ -108,7 +107,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiArray<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj ), JObject::default() ) @@ -128,7 +126,6 @@ fn inner_create_with_ffi_array<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let c_array_ptr = arrow_array_addr as *mut FFI_ArrowArray; let c_schema_ptr = arrow_schema_addr as *mut FFI_ArrowSchema; @@ -154,7 +151,6 @@ fn inner_create_with_ffi_array<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj, reader, ) } @@ -173,7 +169,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'a> { ok_or_throw_with_return!( env, @@ -189,7 +184,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj ), JObject::null() ) @@ -208,7 +202,6 @@ fn inner_create_with_ffi_stream<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let stream_ptr = arrow_array_stream_addr as *mut FFI_ArrowArrayStream; let reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }?; @@ -224,7 +217,6 @@ fn inner_create_with_ffi_stream<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj, reader, ) } @@ -241,7 +233,6 @@ fn create_fragment<'a>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional source: impl StreamingWriteSource, ) -> Result> { let path_str = dataset_uri.extract(env)?; @@ -257,7 +248,6 @@ fn create_fragment<'a>( None, &storage_options_obj, &storage_options_provider_obj, - &s3_credentials_refresh_offset_seconds_obj, &JObject::null(), // not used when creating fragments &JObject::null(), // not used when creating fragments )?; diff --git a/java/lance-jni/src/transaction.rs b/java/lance-jni/src/transaction.rs index 5f43f454154..923de6736f0 100644 --- a/java/lance-jni/src/transaction.rs +++ b/java/lance-jni/src/transaction.rs @@ -754,19 +754,13 @@ fn inner_commit_transaction<'local>( .call_method(&java_transaction, "writeParams", "()Ljava/util/Map;", &[])? .l()?; let write_param_jmap = JMap::from_env(env, &write_param_jobj)?; - let mut write_param = to_rust_map(env, &write_param_jmap)?; - - // Extract s3_credentials_refresh_offset_seconds from write_param - let s3_credentials_refresh_offset = write_param - .remove("s3_credentials_refresh_offset_seconds") - .and_then(|v| v.parse::().ok()) - .map(std::time::Duration::from_secs) - .unwrap_or_else(|| std::time::Duration::from_secs(10)); + let write_param = to_rust_map(env, &write_param_jmap)?; // Get the Dataset's storage_options_provider let storage_options_provider = { let dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(&java_dataset, NATIVE_DATASET) }?; + #[allow(deprecated)] dataset_guard.get_storage_options_provider() }; @@ -774,7 +768,6 @@ fn inner_commit_transaction<'local>( let store_params = ObjectStoreParams { storage_options: Some(write_param), storage_options_provider, - s3_credentials_refresh_offset, ..Default::default() }; diff --git a/java/lance-jni/src/utils.rs b/java/lance-jni/src/utils.rs index a87df02040f..567182c39a1 100644 --- a/java/lance-jni/src/utils.rs +++ b/java/lance-jni/src/utils.rs @@ -50,7 +50,6 @@ pub fn extract_write_params( enable_v2_manifest_paths: Option<&JObject>, storage_options_obj: &JObject, storage_options_provider_obj: &JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: &JObject, // Optional initial_bases: &JObject, // Optional target_bases: &JObject, // Optional ) -> Result { @@ -98,12 +97,6 @@ pub fn extract_write_params( let storage_options_provider_arc: Option> = storage_options_provider.map(|v| Arc::new(v) as Arc); - // Extract s3_credentials_refresh_offset_seconds if present - let s3_credentials_refresh_offset = env - .get_long_opt(s3_credentials_refresh_offset_seconds_obj)? - .map(|v| std::time::Duration::from_secs(v as u64)) - .unwrap_or_else(|| std::time::Duration::from_secs(10)); - if let Some(initial_bases) = env.get_list_opt(initial_bases, |env, elem| elem.extract_object(env))? { @@ -117,7 +110,6 @@ pub fn extract_write_params( write_params.store_params = Some(ObjectStoreParams { storage_options: Some(storage_options), storage_options_provider: storage_options_provider_arc, - s3_credentials_refresh_offset, ..Default::default() }); Ok(write_params) diff --git a/java/src/main/java/org/lance/Dataset.java b/java/src/main/java/org/lance/Dataset.java index 4bf638f6262..6b7e914ea35 100644 --- a/java/src/main/java/org/lance/Dataset.java +++ b/java/src/main/java/org/lance/Dataset.java @@ -144,7 +144,6 @@ public static Dataset create( params.getDataStorageVersion(), params.getEnableV2ManifestPaths(), params.getStorageOptions(), - params.getS3CredentialsRefreshOffsetSeconds(), params.getInitialBases(), params.getTargetBases()); dataset.allocator = allocator; @@ -206,7 +205,6 @@ static Dataset create( params.getEnableV2ManifestPaths(), params.getStorageOptions(), Optional.ofNullable(storageOptionsProvider), - params.getS3CredentialsRefreshOffsetSeconds(), params.getInitialBases(), params.getTargetBases()); dataset.allocator = allocator; @@ -224,7 +222,6 @@ private static native Dataset createWithFfiSchema( Optional dataStorageVersion, Optional enableV2ManifestPaths, Map storageOptions, - Optional s3CredentialsRefreshOffsetSeconds, Optional> initialBases, Optional> targetBases); @@ -239,7 +236,6 @@ private static native Dataset createWithFfiStream( Optional dataStorageVersion, Optional enableV2ManifestPaths, Map storageOptions, - Optional s3CredentialsRefreshOffsetSeconds, Optional> initialBases, Optional> targetBases); @@ -255,7 +251,6 @@ private static native Dataset createWithFfiStreamAndProvider( Optional enableV2ManifestPaths, Map storageOptions, Optional storageOptionsProvider, - Optional s3CredentialsRefreshOffsetSeconds, Optional> initialBases, Optional> targetBases); @@ -335,8 +330,7 @@ static Dataset open( options.getMetadataCacheSizeBytes(), options.getStorageOptions(), options.getSerializedManifest(), - options.getStorageOptionsProvider(), - options.getS3CredentialsRefreshOffsetSeconds()); + options.getStorageOptionsProvider()); dataset.allocator = allocator; dataset.selfManagedAllocator = selfManagedAllocator; return dataset; @@ -350,8 +344,7 @@ private static native Dataset openNative( long metadataCacheSizeBytes, Map storageOptions, Optional serializedManifest, - Optional storageOptionsProvider, - Optional s3CredentialsRefreshOffsetSeconds); + Optional storageOptionsProvider); /** * Creates a builder for opening a dataset. diff --git a/java/src/main/java/org/lance/Fragment.java b/java/src/main/java/org/lance/Fragment.java index 62bcabf9bdd..8eb1f70053d 100644 --- a/java/src/main/java/org/lance/Fragment.java +++ b/java/src/main/java/org/lance/Fragment.java @@ -211,7 +211,6 @@ private native FragmentUpdateResult nativeUpdateColumns( * .allocator(allocator) * .data(vectorSchemaRoot) * .storageOptions(storageOptions) - * .s3CredentialsRefreshOffsetSeconds(10) * .execute(); * } * @@ -277,8 +276,7 @@ public static List create( params.getEnableStableRowIds(), params.getDataStorageVersion(), params.getStorageOptions(), - Optional.ofNullable(storageOptionsProvider), - params.getS3CredentialsRefreshOffsetSeconds()); + Optional.ofNullable(storageOptionsProvider)); } } @@ -330,8 +328,7 @@ public static List create( params.getEnableStableRowIds(), params.getDataStorageVersion(), params.getStorageOptions(), - Optional.ofNullable(storageOptionsProvider), - params.getS3CredentialsRefreshOffsetSeconds()); + Optional.ofNullable(storageOptionsProvider)); } /** @@ -350,8 +347,7 @@ private static native List createWithFfiArray( Optional enableStableRowIds, Optional dataStorageVersion, Map storageOptions, - Optional storageOptionsProvider, - Optional s3CredentialsRefreshOffsetSeconds); + Optional storageOptionsProvider); /** * Create a fragment from the given arrow stream. @@ -368,6 +364,5 @@ private static native List createWithFfiStream( Optional enableStableRowIds, Optional dataStorageVersion, Map storageOptions, - Optional storageOptionsProvider, - Optional s3CredentialsRefreshOffsetSeconds); + Optional storageOptionsProvider); } diff --git a/java/src/main/java/org/lance/OpenDatasetBuilder.java b/java/src/main/java/org/lance/OpenDatasetBuilder.java index ae082e14ceb..fc350fb0fcf 100644 --- a/java/src/main/java/org/lance/OpenDatasetBuilder.java +++ b/java/src/main/java/org/lance/OpenDatasetBuilder.java @@ -58,7 +58,6 @@ public class OpenDatasetBuilder { private LanceNamespace namespace; private List tableId; private ReadOptions options = new ReadOptions.Builder().build(); - private boolean ignoreNamespaceTableStorageOptions = false; /** Creates a new builder instance. Package-private, use Dataset.open() instead. */ OpenDatasetBuilder() {} @@ -128,19 +127,6 @@ public OpenDatasetBuilder readOptions(ReadOptions options) { return this; } - /** - * Sets whether to ignore storage options from the namespace's describeTable(). - * - * @param ignoreNamespaceTableStorageOptions If true, storage options returned from - * describeTable() will be ignored (treated as null) - * @return this builder instance - */ - public OpenDatasetBuilder ignoreNamespaceTableStorageOptions( - boolean ignoreNamespaceTableStorageOptions) { - this.ignoreNamespaceTableStorageOptions = ignoreNamespaceTableStorageOptions; - return this; - } - /** * Opens the dataset with the configured parameters. * @@ -204,8 +190,7 @@ private Dataset buildFromNamespace() { throw new IllegalArgumentException("Namespace did not return a table location"); } - Map namespaceStorageOptions = - ignoreNamespaceTableStorageOptions ? null : response.getStorageOptions(); + Map namespaceStorageOptions = response.getStorageOptions(); ReadOptions.Builder optionsBuilder = new ReadOptions.Builder() @@ -221,9 +206,6 @@ private Dataset buildFromNamespace() { options.getVersion().ifPresent(optionsBuilder::setVersion); options.getBlockSize().ifPresent(optionsBuilder::setBlockSize); options.getSerializedManifest().ifPresent(optionsBuilder::setSerializedManifest); - options - .getS3CredentialsRefreshOffsetSeconds() - .ifPresent(optionsBuilder::setS3CredentialsRefreshOffsetSeconds); Map storageOptions = new HashMap<>(options.getStorageOptions()); if (namespaceStorageOptions != null) { diff --git a/java/src/main/java/org/lance/ReadOptions.java b/java/src/main/java/org/lance/ReadOptions.java index 9d08c834008..0a7a0343a79 100644 --- a/java/src/main/java/org/lance/ReadOptions.java +++ b/java/src/main/java/org/lance/ReadOptions.java @@ -32,7 +32,6 @@ public class ReadOptions { private final Optional serializedManifest; private final Map storageOptions; private final Optional storageOptionsProvider; - private final Optional s3CredentialsRefreshOffsetSeconds; private ReadOptions(Builder builder) { this.version = builder.version; @@ -42,7 +41,6 @@ private ReadOptions(Builder builder) { this.storageOptions = builder.storageOptions; this.serializedManifest = builder.serializedManifest; this.storageOptionsProvider = builder.storageOptionsProvider; - this.s3CredentialsRefreshOffsetSeconds = builder.s3CredentialsRefreshOffsetSeconds; } public Optional getVersion() { @@ -73,10 +71,6 @@ public Optional getStorageOptionsProvider() { return storageOptionsProvider; } - public Optional getS3CredentialsRefreshOffsetSeconds() { - return s3CredentialsRefreshOffsetSeconds; - } - @Override public String toString() { return MoreObjects.toStringHelper(this) @@ -100,7 +94,6 @@ public static class Builder { private Map storageOptions = new HashMap<>(); private Optional serializedManifest = Optional.empty(); private Optional storageOptionsProvider = Optional.empty(); - private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); /** * Set the version of the dataset to read. If not set, read from latest version. @@ -221,22 +214,6 @@ public Builder setStorageOptionsProvider(StorageOptionsProvider storageOptionsPr return this; } - /** - * Set the number of seconds before credential expiration to trigger a refresh. - * - *

Default is 60 seconds. Only applicable when using AWS S3 with temporary credentials. For - * example, if set to 60, credentials will be refreshed when they have less than 60 seconds - * remaining before expiration. This should be set shorter than the credential lifetime to avoid - * using expired credentials. - * - * @param s3CredentialsRefreshOffsetSeconds the refresh offset in seconds - * @return this builder - */ - public Builder setS3CredentialsRefreshOffsetSeconds(long s3CredentialsRefreshOffsetSeconds) { - this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); - return this; - } - public ReadOptions build() { return new ReadOptions(this); } diff --git a/java/src/main/java/org/lance/Transaction.java b/java/src/main/java/org/lance/Transaction.java index 67bc5f8d93d..2d565c73258 100644 --- a/java/src/main/java/org/lance/Transaction.java +++ b/java/src/main/java/org/lance/Transaction.java @@ -118,7 +118,6 @@ public static class Builder { private Operation operation; private Map writeParams; private Map transactionProperties; - private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); public Builder(Dataset dataset) { this.dataset = dataset; @@ -140,21 +139,6 @@ public Builder writeParams(Map writeParams) { return this; } - /** - * Sets the S3 credentials refresh offset in seconds. - * - *

This parameter controls how long before credential expiration to refresh them. For - * example, if credentials expire at T+60s and this is set to 10, credentials will be refreshed - * at T+50s. - * - * @param s3CredentialsRefreshOffsetSeconds Refresh offset in seconds - * @return this builder instance - */ - public Builder s3CredentialsRefreshOffsetSeconds(long s3CredentialsRefreshOffsetSeconds) { - this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); - return this; - } - public Builder operation(Operation operation) { validateState(); this.operation = operation; @@ -171,15 +155,8 @@ private void validateState() { public Transaction build() { Preconditions.checkState(operation != null, "TransactionBuilder has no operations"); - // Merge s3_credentials_refresh_offset_seconds into writeParams if present - Map finalWriteParams = - writeParams != null ? new HashMap<>(writeParams) : new HashMap<>(); - s3CredentialsRefreshOffsetSeconds.ifPresent( - value -> - finalWriteParams.put("s3_credentials_refresh_offset_seconds", String.valueOf(value))); - return new Transaction( - dataset, readVersion, uuid, operation, finalWriteParams, transactionProperties); + dataset, readVersion, uuid, operation, writeParams, transactionProperties); } } } diff --git a/java/src/main/java/org/lance/WriteDatasetBuilder.java b/java/src/main/java/org/lance/WriteDatasetBuilder.java index ec7e856b4f3..8a8c33163da 100644 --- a/java/src/main/java/org/lance/WriteDatasetBuilder.java +++ b/java/src/main/java/org/lance/WriteDatasetBuilder.java @@ -80,7 +80,6 @@ public class WriteDatasetBuilder { private Optional maxBytesPerFile = Optional.empty(); private Optional enableStableRowIds = Optional.empty(); private Optional dataStorageVersion = Optional.empty(); - private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); private Optional> initialBases = Optional.empty(); private Optional> targetBases = Optional.empty(); @@ -276,21 +275,6 @@ public WriteDatasetBuilder dataStorageVersion(WriteParams.LanceFileVersion dataS return this; } - /** - * Sets the S3 credentials refresh offset in seconds. - * - *

This parameter controls how long before credential expiration to refresh them. For example, - * if credentials expire at T+60s and this is set to 10, credentials will be refreshed at T+50s. - * - * @param s3CredentialsRefreshOffsetSeconds Refresh offset in seconds - * @return this builder instance - */ - public WriteDatasetBuilder s3CredentialsRefreshOffsetSeconds( - long s3CredentialsRefreshOffsetSeconds) { - this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); - return this; - } - public WriteDatasetBuilder initialBases(List bases) { this.initialBases = Optional.of(bases); return this; @@ -424,8 +408,6 @@ private Dataset executeWithNamespace() { maxBytesPerFile.ifPresent(paramsBuilder::withMaxBytesPerFile); enableStableRowIds.ifPresent(paramsBuilder::withEnableStableRowIds); dataStorageVersion.ifPresent(paramsBuilder::withDataStorageVersion); - s3CredentialsRefreshOffsetSeconds.ifPresent( - paramsBuilder::withS3CredentialsRefreshOffsetSeconds); initialBases.ifPresent(paramsBuilder::withInitialBases); targetBases.ifPresent(paramsBuilder::withTargetBases); @@ -451,8 +433,6 @@ private Dataset executeWithUri() { maxBytesPerFile.ifPresent(paramsBuilder::withMaxBytesPerFile); enableStableRowIds.ifPresent(paramsBuilder::withEnableStableRowIds); dataStorageVersion.ifPresent(paramsBuilder::withDataStorageVersion); - s3CredentialsRefreshOffsetSeconds.ifPresent( - paramsBuilder::withS3CredentialsRefreshOffsetSeconds); initialBases.ifPresent(paramsBuilder::withInitialBases); targetBases.ifPresent(paramsBuilder::withTargetBases); diff --git a/java/src/main/java/org/lance/WriteFragmentBuilder.java b/java/src/main/java/org/lance/WriteFragmentBuilder.java index 76882b14a29..56ce06a7b0a 100644 --- a/java/src/main/java/org/lance/WriteFragmentBuilder.java +++ b/java/src/main/java/org/lance/WriteFragmentBuilder.java @@ -37,7 +37,6 @@ * .allocator(allocator) * .data(vectorSchemaRoot) * .storageOptions(storageOptions) - * .s3CredentialsRefreshOffsetSeconds(10) * .execute(); * } */ @@ -134,21 +133,6 @@ public WriteFragmentBuilder storageOptionsProvider(StorageOptionsProvider provid return this; } - /** - * Set the S3 credentials refresh offset in seconds. - * - *

This parameter controls how long before credential expiration to refresh them. For example, - * if credentials expire at T+60s and this is set to 10, credentials will be refreshed at T+50s. - * - * @param seconds refresh offset in seconds - * @return this builder - */ - public WriteFragmentBuilder s3CredentialsRefreshOffsetSeconds(long seconds) { - ensureWriteParamsBuilder(); - this.writeParamsBuilder.withS3CredentialsRefreshOffsetSeconds(seconds); - return this; - } - /** * Set the maximum number of rows per file. * diff --git a/java/src/main/java/org/lance/WriteParams.java b/java/src/main/java/org/lance/WriteParams.java index 2a4853383dd..c095009bc67 100644 --- a/java/src/main/java/org/lance/WriteParams.java +++ b/java/src/main/java/org/lance/WriteParams.java @@ -58,7 +58,6 @@ public String getVersionString() { private final Optional dataStorageVersion; private final Optional enableV2ManifestPaths; private Map storageOptions = new HashMap<>(); - private final Optional s3CredentialsRefreshOffsetSeconds; private final Optional> initialBases; private final Optional> targetBases; @@ -71,7 +70,6 @@ private WriteParams( Optional dataStorageVersion, Optional enableV2ManifestPaths, Map storageOptions, - Optional s3CredentialsRefreshOffsetSeconds, Optional> initialBases, Optional> targetBases) { this.maxRowsPerFile = maxRowsPerFile; @@ -82,7 +80,6 @@ private WriteParams( this.dataStorageVersion = dataStorageVersion; this.enableV2ManifestPaths = enableV2ManifestPaths; this.storageOptions = storageOptions; - this.s3CredentialsRefreshOffsetSeconds = s3CredentialsRefreshOffsetSeconds; this.initialBases = initialBases; this.targetBases = targetBases; } @@ -124,10 +121,6 @@ public Map getStorageOptions() { return storageOptions; } - public Optional getS3CredentialsRefreshOffsetSeconds() { - return s3CredentialsRefreshOffsetSeconds; - } - public Optional> getInitialBases() { return initialBases; } @@ -157,7 +150,6 @@ public static class Builder { private Optional dataStorageVersion = Optional.empty(); private Optional enableV2ManifestPaths; private Map storageOptions = new HashMap<>(); - private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); private Optional> initialBases = Optional.empty(); private Optional> targetBases = Optional.empty(); @@ -201,11 +193,6 @@ public Builder withEnableV2ManifestPaths(boolean enableV2ManifestPaths) { return this; } - public Builder withS3CredentialsRefreshOffsetSeconds(long s3CredentialsRefreshOffsetSeconds) { - this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); - return this; - } - public Builder withInitialBases(List initialBases) { this.initialBases = Optional.of(initialBases); return this; @@ -226,7 +213,6 @@ public WriteParams build() { dataStorageVersion, enableV2ManifestPaths, storageOptions, - s3CredentialsRefreshOffsetSeconds, initialBases, targetBases); } diff --git a/java/src/test/java/org/lance/NamespaceIntegrationTest.java b/java/src/test/java/org/lance/NamespaceIntegrationTest.java index ad0b55dccdc..85e86ff5d09 100644 --- a/java/src/test/java/org/lance/NamespaceIntegrationTest.java +++ b/java/src/test/java/org/lance/NamespaceIntegrationTest.java @@ -326,11 +326,7 @@ public VectorSchemaRoot getVectorSchemaRoot() { assertEquals(1, namespace.getCreateCallCount(), "createEmptyTable should be called once"); // Open dataset through namespace WITH refresh enabled - // Use 10-second refresh offset, so credentials effectively expire at T+50s - ReadOptions readOptions = - new ReadOptions.Builder() - .setS3CredentialsRefreshOffsetSeconds(10) // Refresh 10s before expiration - .build(); + ReadOptions readOptions = new ReadOptions.Builder().build(); int callCountBeforeOpen = namespace.getDescribeCallCount(); try (Dataset dsFromNamespace = @@ -451,7 +447,6 @@ public VectorSchemaRoot getVectorSchemaRoot() { .namespace(namespace) .tableId(Arrays.asList(tableName)) .mode(WriteParams.WriteMode.CREATE) - .s3CredentialsRefreshOffsetSeconds(2) // Refresh 2s before expiration .execute()) { assertEquals(2, dataset.countRows()); } @@ -461,11 +456,7 @@ public VectorSchemaRoot getVectorSchemaRoot() { assertEquals(1, namespace.getCreateCallCount(), "createEmptyTable should be called once"); // Open dataset through namespace with refresh enabled - // Use 2-second refresh offset so credentials effectively expire at T+3s (5s - 2s) - ReadOptions readOptions = - new ReadOptions.Builder() - .setS3CredentialsRefreshOffsetSeconds(2) // Refresh 2s before expiration - .build(); + ReadOptions readOptions = new ReadOptions.Builder().build(); int callCountBeforeOpen = namespace.getDescribeCallCount(); try (Dataset dsFromNamespace = @@ -692,7 +683,6 @@ public VectorSchemaRoot getVectorSchemaRoot() { }; // Use the write builder to create a dataset through namespace - // Set a 1-second refresh offset. Credentials expire at T+60s, so refresh at T+59s. // Write completes instantly, so NO describeTable call should happen for refresh. try (Dataset dataset = Dataset.write() @@ -701,7 +691,6 @@ public VectorSchemaRoot getVectorSchemaRoot() { .namespace(namespace) .tableId(Arrays.asList(tableName)) .mode(WriteParams.WriteMode.CREATE) - .s3CredentialsRefreshOffsetSeconds(1) .execute()) { // Verify createEmptyTable was called exactly ONCE @@ -732,9 +721,7 @@ public VectorSchemaRoot getVectorSchemaRoot() { "describeTable should still be 0 after close (no refresh needed)"); // Now open the dataset through namespace with long-lived credentials (60s expiration) - // With 1s refresh offset, credentials are valid for 59s - plenty of time for reads - ReadOptions readOptions = - new ReadOptions.Builder().setS3CredentialsRefreshOffsetSeconds(1).build(); + ReadOptions readOptions = new ReadOptions.Builder().build(); try (Dataset dsFromNamespace = Dataset.open() diff --git a/python/python/lance/__init__.py b/python/python/lance/__init__.py index ff2870b6d1a..d98de0424ad 100644 --- a/python/python/lance/__init__.py +++ b/python/python/lance/__init__.py @@ -99,8 +99,6 @@ def dataset( session: Optional[Session] = None, namespace: Optional[LanceNamespace] = None, table_id: Optional[List[str]] = None, - ignore_namespace_table_storage_options: bool = False, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ) -> LanceDataset: """ Opens the Lance dataset from the address specified. @@ -168,26 +166,13 @@ def dataset( table_id : optional, List[str] The table identifier when using a namespace (e.g., ["my_table"]). Must be provided together with `namespace`. Cannot be used with `uri`. - ignore_namespace_table_storage_options : bool, default False - Only applicable when using `namespace` and `table_id`. If True, storage - options returned from the namespace's describe_table() will be ignored - (treated as None). If False (default), storage options from describe_table() - will be used and a dynamic storage options provider will be created to - automatically refresh credentials before they expire. - s3_credentials_refresh_offset_seconds : optional, int - The number of seconds before credential expiration to trigger a refresh. - Default is 60 seconds. Only applicable when using AWS S3 with temporary - credentials. For example, if set to 60, credentials will be refreshed - when they have less than 60 seconds remaining before expiration. This - should be set shorter than the credential lifetime to avoid using - expired credentials. Notes ----- When using `namespace` and `table_id`: - The `uri` parameter is optional and will be fetched from the namespace - - Storage options from describe_table() will be used unless - `ignore_namespace_table_storage_options=True` + - Storage options from describe_table() will be used automatically + - A dynamic storage options provider will be created to refresh credentials - Initial storage options from describe_table() will be merged with any provided `storage_options` """ @@ -220,10 +205,7 @@ def dataset( if uri is None: raise ValueError("Namespace did not return a 'location' for the table") - if ignore_namespace_table_storage_options: - namespace_storage_options = None - else: - namespace_storage_options = response.storage_options + namespace_storage_options = response.storage_options if namespace_storage_options: storage_options_provider = LanceNamespaceStorageOptionsProvider( @@ -251,7 +233,6 @@ def dataset( read_params=read_params, session=session, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, ) if version is None and asof is not None: ts_cutoff = sanitize_ts(asof) @@ -276,7 +257,6 @@ def dataset( read_params=read_params, session=session, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, ) else: return ds diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index bc45824cdbd..4a7f75c64b6 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -433,7 +433,6 @@ def __init__( read_params: Optional[Dict[str, Any]] = None, session: Optional[Session] = None, storage_options_provider: Optional[Any] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ): uri = os.fspath(uri) if isinstance(uri, Path) else uri self._uri = uri @@ -463,7 +462,6 @@ def __init__( read_params=read_params, session=session, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, ) self._default_scan_options = default_scan_options self._read_params = read_params @@ -5551,8 +5549,6 @@ def write_dataset( target_bases: Optional[List[str]] = None, namespace: Optional[LanceNamespace] = None, table_id: Optional[List[str]] = None, - ignore_namespace_table_storage_options: bool = False, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ) -> LanceDataset: """Write a given data_obj to the given uri @@ -5654,29 +5650,16 @@ def write_dataset( table_id : optional, List[str] The table identifier when using a namespace (e.g., ["my_table"]). Must be provided together with `namespace`. Cannot be used with `uri`. - ignore_namespace_table_storage_options : bool, default False - If True, ignore the storage options returned by the namespace and only use - the provided `storage_options` parameter. The storage options provider will - not be created, so credentials will not be automatically refreshed. - This is useful when you want to use your own credentials instead of the - namespace-provided credentials. - s3_credentials_refresh_offset_seconds : optional, int - The number of seconds before credential expiration to trigger a refresh. - Default is 60 seconds. Only applicable when using AWS S3 with temporary - credentials. For example, if set to 60, credentials will be refreshed - when they have less than 60 seconds remaining before expiration. This - should be set shorter than the credential lifetime to avoid using - expired credentials. Notes ----- When using `namespace` and `table_id`: - The `uri` parameter is optional and will be fetched from the namespace + - Storage options from describe_table() will be used automatically - A `LanceNamespaceStorageOptionsProvider` will be created automatically for - storage options refresh (unless `ignore_namespace_table_storage_options=True`) + storage options refresh - Initial storage options from describe_table() will be merged with - any provided `storage_options` (unless - `ignore_namespace_table_storage_options=True`) + any provided `storage_options` """ # Validate that user provides either uri OR (namespace + table_id), not both has_uri = uri is not None @@ -5758,11 +5741,8 @@ def write_dataset( f"Namespace did not return a table location in {mode} response" ) - # Check if we should ignore namespace storage options - if ignore_namespace_table_storage_options: - namespace_storage_options = None - else: - namespace_storage_options = response.storage_options + # Use namespace storage options + namespace_storage_options = response.storage_options # Set up storage options and provider if namespace_storage_options: @@ -5825,12 +5805,6 @@ def write_dataset( if storage_options_provider is not None: params["storage_options_provider"] = storage_options_provider - # Add s3_credentials_refresh_offset_seconds if specified - if s3_credentials_refresh_offset_seconds is not None: - params["s3_credentials_refresh_offset_seconds"] = ( - s3_credentials_refresh_offset_seconds - ) - if commit_lock: if not callable(commit_lock): raise TypeError(f"commit_lock must be a function, got {type(commit_lock)}") diff --git a/python/python/lance/file.py b/python/python/lance/file.py index dec4aea00b6..8a20e4aff2f 100644 --- a/python/python/lance/file.py +++ b/python/python/lance/file.py @@ -68,7 +68,6 @@ def __init__( columns: Optional[List[str]] = None, *, storage_options_provider: Optional[StorageOptionsProvider] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, _inner_reader: Optional[_LanceFileReader] = None, ): """ @@ -86,9 +85,6 @@ def __init__( storage_options_provider : optional A provider that can provide storage options dynamically. This is useful for credentials that need to be refreshed or vended on-demand. - s3_credentials_refresh_offset_seconds : optional, int - How early (in seconds) before expiration to refresh S3 credentials. - Default is 60 seconds. Only applies when using storage_options_provider. columns: list of str, default None List of column names to be fetched. All columns are fetched if None or unspecified. @@ -102,7 +98,6 @@ def __init__( path, storage_options=storage_options, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, columns=columns, ) @@ -219,7 +214,6 @@ def __init__( base_path: str, storage_options: Optional[Dict[str, str]] = None, storage_options_provider: Optional[StorageOptionsProvider] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ): """ Creates a new file session @@ -236,9 +230,6 @@ def __init__( storage_options_provider : optional A provider that can provide storage options dynamically. This is useful for credentials that need to be refreshed or vended on-demand. - s3_credentials_refresh_offset_seconds : optional, int - How early (in seconds) before expiration to refresh S3 credentials. - Default is 60 seconds. Only applies when using storage_options_provider. """ if isinstance(base_path, Path): base_path = str(base_path) @@ -246,7 +237,6 @@ def __init__( base_path, storage_options=storage_options, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, ) def open_reader( @@ -391,7 +381,6 @@ def __init__( version: Optional[str] = None, storage_options: Optional[Dict[str, str]] = None, storage_options_provider: Optional[StorageOptionsProvider] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, max_page_bytes: Optional[int] = None, _inner_writer: Optional[_LanceFileWriter] = None, **kwargs, @@ -422,9 +411,6 @@ def __init__( A storage options provider that can fetch and refresh storage options dynamically. This is useful for credentials that expire and need to be refreshed automatically. - s3_credentials_refresh_offset_seconds : optional, int - How early (in seconds) before expiration to refresh S3 credentials. - Default is 60 seconds. Only applies when using storage_options_provider. max_page_bytes : optional, int The maximum size of a page in bytes, if a single array would create a page larger than this then it will be split into multiple pages. The @@ -442,7 +428,6 @@ def __init__( version=version, storage_options=storage_options, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, max_page_bytes=max_page_bytes, **kwargs, ) diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index 506e4e267d5..a346b648ed5 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -95,7 +95,6 @@ class LanceFileWriter: version: Optional[str], storage_options: Optional[Dict[str, str]], storage_options_provider: Optional[StorageOptionsProvider], - s3_credentials_refresh_offset_seconds: Optional[int], keep_original_array: Optional[bool], max_page_bytes: Optional[int], ): ... @@ -110,7 +109,6 @@ class LanceFileSession: base_path: str, storage_options: Optional[Dict[str, str]] = None, storage_options_provider: Optional[StorageOptionsProvider] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ): ... def open_reader( self, path: str, columns: Optional[List[str]] = None @@ -135,7 +133,6 @@ class LanceFileReader: path: str, storage_options: Optional[Dict[str, str]], storage_options_provider: Optional[StorageOptionsProvider], - s3_credentials_refresh_offset_seconds: Optional[int], columns: Optional[List[str]] = None, ): ... def read_all( diff --git a/python/python/tests/test_namespace_integration.py b/python/python/tests/test_namespace_integration.py index 3c93dbcb504..01a3b6859e9 100644 --- a/python/python/tests/test_namespace_integration.py +++ b/python/python/tests/test_namespace_integration.py @@ -235,7 +235,6 @@ def test_namespace_with_refresh(s3_bucket: str): namespace=namespace, table_id=table_id, mode="create", - s3_credentials_refresh_offset_seconds=1, ) assert ds.count_rows() == 2 assert namespace.get_create_call_count() == 1 @@ -243,7 +242,6 @@ def test_namespace_with_refresh(s3_bucket: str): ds_from_namespace = lance.dataset( namespace=namespace, table_id=table_id, - s3_credentials_refresh_offset_seconds=1, ) initial_call_count = namespace.get_describe_call_count() @@ -574,7 +572,6 @@ def test_file_writer_with_storage_options_provider(s3_bucket: str): schema=schema, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) batch = pa.RecordBatch.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6]}, schema=schema) @@ -593,7 +590,6 @@ def test_file_writer_with_storage_options_provider(s3_bucket: str): file_uri, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) result = reader.read_all(batch_size=1024) result_table = result.to_table() @@ -613,7 +609,6 @@ def test_file_writer_with_storage_options_provider(s3_bucket: str): schema=schema, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) batch3 = pa.RecordBatch.from_pydict( @@ -629,7 +624,6 @@ def test_file_writer_with_storage_options_provider(s3_bucket: str): file_uri2, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) result2 = reader2.read_all(batch_size=1024) result_table2 = result2.to_table() @@ -696,7 +690,6 @@ def test_file_reader_with_storage_options_provider(s3_bucket: str): file_uri, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) result = reader.read_all(batch_size=1024) result_table = result.to_table() @@ -727,7 +720,6 @@ def test_file_reader_with_storage_options_provider(s3_bucket: str): file_uri2, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) result2 = reader2.read_all(batch_size=1024) result_table2 = result2.to_table() @@ -778,7 +770,6 @@ def test_file_session_with_storage_options_provider(s3_bucket: str): f"s3://{s3_bucket}/{table_name}_session", storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) # Test contains method diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 017b3c085e3..e6a954c4439 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -481,8 +481,9 @@ pub struct Dataset { #[pymethods] impl Dataset { #[allow(clippy::too_many_arguments)] + #[allow(deprecated)] #[new] - #[pyo3(signature=(uri, version=None, block_size=None, index_cache_size=None, metadata_cache_size=None, commit_handler=None, storage_options=None, manifest=None, metadata_cache_size_bytes=None, index_cache_size_bytes=None, read_params=None, session=None, storage_options_provider=None, s3_credentials_refresh_offset_seconds=None))] + #[pyo3(signature=(uri, version=None, block_size=None, index_cache_size=None, metadata_cache_size=None, commit_handler=None, storage_options=None, manifest=None, metadata_cache_size_bytes=None, index_cache_size_bytes=None, read_params=None, session=None, storage_options_provider=None))] fn new( py: Python, uri: String, @@ -498,7 +499,6 @@ impl Dataset { read_params: Option<&Bound>, session: Option, storage_options_provider: Option<&Bound<'_, PyAny>>, - s3_credentials_refresh_offset_seconds: Option, ) -> PyResult { let mut params = ReadParams::default(); if let Some(metadata_cache_size_bytes) = metadata_cache_size_bytes { @@ -515,16 +515,12 @@ impl Dataset { let index_cache_size_bytes = index_cache_size * 20 * 1024 * 1024; params.index_cache_size_bytes(index_cache_size_bytes); } - // Set up store options (block size and S3 credentials refresh offset) - let mut store_params = params.store_options.take().unwrap_or_default(); + // Set up store options (block size) if let Some(block_size) = block_size { + let mut store_params = params.store_options.take().unwrap_or_default(); store_params.block_size = Some(block_size); + params.store_options = Some(store_params); } - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - store_params.s3_credentials_refresh_offset = - std::time::Duration::from_secs(offset_seconds); - } - params.store_options = Some(store_params); if let Some(commit_handler) = commit_handler { let py_commit_lock = PyCommitLock::new(commit_handler); params.set_commit_lock(Arc::new(py_commit_lock)); @@ -1533,9 +1529,7 @@ impl Dataset { /// If a storage options provider was configured and credentials are expiring, /// this will refresh them. Returns the current valid storage options, or None /// if no storage options accessor is configured. - fn latest_storage_options( - self_: PyRef<'_, Self>, - ) -> PyResult>> { + fn latest_storage_options(self_: PyRef<'_, Self>) -> PyResult>> { let result = rt() .block_on(Some(self_.py()), self_.ds.latest_storage_options())? .map_err(|err| PyIOError::new_err(err.to_string()))?; @@ -2256,6 +2250,7 @@ impl Dataset { } #[allow(clippy::too_many_arguments)] + #[allow(deprecated)] #[staticmethod] #[pyo3(signature = (dest, transaction, commit_lock = None, storage_options = None, storage_options_provider = None, enable_v2_manifest_paths = None, detached = None, max_retries = None))] fn commit_transaction( @@ -2319,6 +2314,7 @@ impl Dataset { } #[allow(clippy::too_many_arguments)] + #[allow(deprecated)] #[staticmethod] #[pyo3(signature = (dest, transactions, commit_lock = None, storage_options = None, storage_options_provider = None, enable_v2_manifest_paths = None, detached = None, max_retries = None))] fn commit_batch( @@ -3109,6 +3105,7 @@ fn get_dict_opt<'a, 'py, D: FromPyObject<'a>>( .transpose() } +#[allow(deprecated)] pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult> { let params = if options.is_none() { None @@ -3144,21 +3141,10 @@ pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult(options, "s3_credentials_refresh_offset_seconds")?; - - if storage_options.is_some() - || storage_options_provider.is_some() - || s3_credentials_refresh_offset_seconds.is_some() - { - let s3_credentials_refresh_offset = s3_credentials_refresh_offset_seconds - .map(std::time::Duration::from_secs) - .unwrap_or(std::time::Duration::from_secs(60)); - + if storage_options.is_some() || storage_options_provider.is_some() { p.store_params = Some(ObjectStoreParams { storage_options, storage_options_provider, - s3_credentials_refresh_offset, ..Default::default() }); } diff --git a/python/src/file.rs b/python/src/file.rs index ba96710a0d5..80c66d739d1 100644 --- a/python/src/file.rs +++ b/python/src/file.rs @@ -241,7 +241,6 @@ impl LanceFileWriter { version: Option, storage_options: Option>, storage_options_provider: Option>, - s3_credentials_refresh_offset_seconds: Option, keep_original_array: Option, max_page_bytes: Option, ) -> PyResult { @@ -249,7 +248,6 @@ impl LanceFileWriter { uri_or_path, storage_options, storage_options_provider, - s3_credentials_refresh_offset_seconds, ) .await?; Self::open_with_store( @@ -299,7 +297,7 @@ impl LanceFileWriter { #[pymethods] impl LanceFileWriter { #[new] - #[pyo3(signature=(path, schema=None, data_cache_bytes=None, version=None, storage_options=None, storage_options_provider=None, s3_credentials_refresh_offset_seconds=None, keep_original_array=None, max_page_bytes=None))] + #[pyo3(signature=(path, schema=None, data_cache_bytes=None, version=None, storage_options=None, storage_options_provider=None, keep_original_array=None, max_page_bytes=None))] #[allow(clippy::too_many_arguments)] pub fn new( path: String, @@ -308,7 +306,6 @@ impl LanceFileWriter { version: Option, storage_options: Option>, storage_options_provider: Option<&Bound<'_, PyAny>>, - s3_credentials_refresh_offset_seconds: Option, keep_original_array: Option, max_page_bytes: Option, ) -> PyResult { @@ -326,7 +323,6 @@ impl LanceFileWriter { version, storage_options, provider, - s3_credentials_refresh_offset_seconds, keep_original_array, max_page_bytes, ), @@ -383,25 +379,21 @@ pub async fn object_store_from_uri_or_path( uri_or_path: impl AsRef, storage_options: Option>, ) -> PyResult<(Arc, Path)> { - object_store_from_uri_or_path_with_provider(uri_or_path, storage_options, None, None).await + object_store_from_uri_or_path_with_provider(uri_or_path, storage_options, None).await } +#[allow(deprecated)] pub async fn object_store_from_uri_or_path_with_provider( uri_or_path: impl AsRef, storage_options: Option>, storage_options_provider: Option>, - s3_credentials_refresh_offset_seconds: Option, ) -> PyResult<(Arc, Path)> { let object_store_registry = Arc::new(lance::io::ObjectStoreRegistry::default()); - let mut object_store_params = ObjectStoreParams { + let object_store_params = ObjectStoreParams { storage_options: storage_options.clone(), storage_options_provider, ..Default::default() }; - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - object_store_params.s3_credentials_refresh_offset = - std::time::Duration::from_secs(offset_seconds); - } let (object_store, path) = ObjectStore::from_uri_and_params( object_store_registry, @@ -425,13 +417,11 @@ impl LanceFileSession { uri_or_path: String, storage_options: Option>, storage_options_provider: Option>, - s3_credentials_refresh_offset_seconds: Option, ) -> PyResult { let (object_store, base_path) = object_store_from_uri_or_path_with_provider( uri_or_path, storage_options, storage_options_provider, - s3_credentials_refresh_offset_seconds, ) .await?; Ok(Self { @@ -444,25 +434,16 @@ impl LanceFileSession { #[pymethods] impl LanceFileSession { #[new] - #[pyo3(signature=(uri_or_path, storage_options=None, storage_options_provider=None, s3_credentials_refresh_offset_seconds=None))] + #[pyo3(signature=(uri_or_path, storage_options=None, storage_options_provider=None))] pub fn new( uri_or_path: String, storage_options: Option>, storage_options_provider: Option<&Bound<'_, PyAny>>, - s3_credentials_refresh_offset_seconds: Option, ) -> PyResult { let provider = storage_options_provider .map(crate::storage_options::py_object_to_storage_options_provider) .transpose()?; - rt().block_on( - None, - Self::try_new( - uri_or_path, - storage_options, - provider, - s3_credentials_refresh_offset_seconds, - ), - )? + rt().block_on(None, Self::try_new(uri_or_path, storage_options, provider))? } #[pyo3(signature=(path, columns=None))] @@ -644,14 +625,12 @@ impl LanceFileReader { uri_or_path: String, storage_options: Option>, storage_options_provider: Option>, - s3_credentials_refresh_offset_seconds: Option, columns: Option>, ) -> PyResult { let (object_store, path) = object_store_from_uri_or_path_with_provider( uri_or_path, storage_options, storage_options_provider, - s3_credentials_refresh_offset_seconds, ) .await?; Self::open_with_store(object_store, path, columns).await @@ -749,27 +728,17 @@ impl LanceFileReader { #[pymethods] impl LanceFileReader { #[new] - #[pyo3(signature=(path, storage_options=None, storage_options_provider=None, s3_credentials_refresh_offset_seconds=None, columns=None))] + #[pyo3(signature=(path, storage_options=None, storage_options_provider=None, columns=None))] pub fn new( path: String, storage_options: Option>, storage_options_provider: Option<&Bound<'_, PyAny>>, - s3_credentials_refresh_offset_seconds: Option, columns: Option>, ) -> PyResult { let provider = storage_options_provider .map(crate::storage_options::py_object_to_storage_options_provider) .transpose()?; - rt().block_on( - None, - Self::open( - path, - storage_options, - provider, - s3_credentials_refresh_offset_seconds, - columns, - ), - )? + rt().block_on(None, Self::open(path, storage_options, provider, columns))? } pub fn read_all( diff --git a/python/src/storage_options.rs b/python/src/storage_options.rs index 56486d997be..0c8a4d2d00b 100644 --- a/python/src/storage_options.rs +++ b/python/src/storage_options.rs @@ -3,7 +3,6 @@ use std::collections::HashMap; use std::sync::Arc; -use std::time::Duration; use async_trait::async_trait; use lance_io::object_store::{StorageOptionsAccessor, StorageOptionsProvider}; @@ -199,32 +198,29 @@ impl PyStorageOptionsAccessor { } /// Create an accessor with a dynamic provider (no initial options) + /// + /// The refresh offset is extracted from storage options using the `refresh_offset_millis` key. #[staticmethod] - #[pyo3(signature = (provider, refresh_offset_secs=300))] - fn with_provider(provider: &Bound<'_, PyAny>, refresh_offset_secs: u64) -> PyResult { + fn with_provider(provider: &Bound<'_, PyAny>) -> PyResult { let rust_provider = py_object_to_storage_options_provider(provider)?; Ok(Self { - inner: Arc::new(StorageOptionsAccessor::with_provider( - rust_provider, - Duration::from_secs(refresh_offset_secs), - )), + inner: Arc::new(StorageOptionsAccessor::with_provider(rust_provider)), }) } /// Create an accessor with initial options and a dynamic provider + /// + /// The refresh offset is extracted from initial_options using the `refresh_offset_millis` key. #[staticmethod] - #[pyo3(signature = (initial_options, provider, refresh_offset_secs=300))] fn with_initial_and_provider( initial_options: HashMap, provider: &Bound<'_, PyAny>, - refresh_offset_secs: u64, ) -> PyResult { let rust_provider = py_object_to_storage_options_provider(provider)?; Ok(Self { inner: Arc::new(StorageOptionsAccessor::with_initial_and_provider( initial_options, rust_provider, - Duration::from_secs(refresh_offset_secs), )), }) } @@ -269,31 +265,27 @@ impl PyStorageOptionsAccessor { /// Create a StorageOptionsAccessor from Python parameters /// -/// This handles the conversion from Python types to Rust StorageOptionsAccessor +/// This handles the conversion from Python types to Rust StorageOptionsAccessor. +/// The refresh offset is extracted from storage_options using the `refresh_offset_millis` key. +#[allow(dead_code)] pub fn create_accessor_from_python( storage_options: Option>, storage_options_provider: Option<&Bound<'_, PyAny>>, - refresh_offset_secs: u64, ) -> PyResult>> { match (storage_options, storage_options_provider) { (Some(opts), Some(provider)) => { let rust_provider = py_object_to_storage_options_provider(provider)?; - Ok(Some(Arc::new(StorageOptionsAccessor::with_initial_and_provider( - opts, - rust_provider, - Duration::from_secs(refresh_offset_secs), - )))) + Ok(Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(opts, rust_provider), + ))) } (None, Some(provider)) => { let rust_provider = py_object_to_storage_options_provider(provider)?; Ok(Some(Arc::new(StorageOptionsAccessor::with_provider( rust_provider, - Duration::from_secs(refresh_offset_secs), )))) } - (Some(opts), None) => { - Ok(Some(Arc::new(StorageOptionsAccessor::static_options(opts)))) - } + (Some(opts), None) => Ok(Some(Arc::new(StorageOptionsAccessor::static_options(opts)))), (None, None) => Ok(None), } } diff --git a/rust/lance-io/src/object_store.rs b/rust/lance-io/src/object_store.rs index b53a665f545..9ec16462486 100644 --- a/rust/lance-io/src/object_store.rs +++ b/rust/lance-io/src/object_store.rs @@ -66,6 +66,7 @@ pub const DEFAULT_DOWNLOAD_RETRY_COUNT: usize = 3; pub use providers::{ObjectStoreProvider, ObjectStoreRegistry}; pub use storage_options::{ LanceNamespaceStorageOptionsProvider, StorageOptionsProvider, EXPIRES_AT_MILLIS_KEY, + REFRESH_OFFSET_MILLIS_KEY, }; pub use storage_options_accessor::StorageOptionsAccessor; @@ -189,6 +190,8 @@ pub struct ObjectStoreParams { pub block_size: Option, #[deprecated(note = "Implement an ObjectStoreProvider instead")] pub object_store: Option<(Arc, Url)>, + /// Refresh offset for AWS credentials when using the legacy AWS credentials path. + /// For StorageOptionsAccessor, use `refresh_offset_millis` storage option instead. pub s3_credentials_refresh_offset: Duration, #[cfg(feature = "aws")] pub aws_credentials: Option, @@ -246,16 +249,11 @@ impl ObjectStoreParams { // Create accessor from legacy fields match (&self.storage_options, &self.storage_options_provider) { - (Some(opts), Some(provider)) => { - Some(Arc::new(StorageOptionsAccessor::with_initial_and_provider( - opts.clone(), - provider.clone(), - self.s3_credentials_refresh_offset, - ))) - } + (Some(opts), Some(provider)) => Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(opts.clone(), provider.clone()), + )), (None, Some(provider)) => Some(Arc::new(StorageOptionsAccessor::with_provider( provider.clone(), - self.s3_credentials_refresh_offset, ))), (Some(opts), None) => Some(Arc::new(StorageOptionsAccessor::static_options( opts.clone(), diff --git a/rust/lance-io/src/object_store/providers/aws.rs b/rust/lance-io/src/object_store/providers/aws.rs index ad1c0bcefd5..6ec643c5e8e 100644 --- a/rust/lance-io/src/object_store/providers/aws.rs +++ b/rust/lance-io/src/object_store/providers/aws.rs @@ -486,40 +486,34 @@ impl DynamicStorageOptionsCredentialProvider { /// Create a new credential provider from a storage options provider /// /// This is a convenience constructor for backward compatibility. + /// The refresh offset will be extracted from storage options using + /// the `refresh_offset_millis` key, defaulting to 60 seconds. /// /// # Arguments /// * `provider` - The storage options provider - /// * `refresh_offset` - Duration before expiry to refresh credentials - pub fn from_provider( - provider: Arc, - refresh_offset: Duration, - ) -> Self { + pub fn from_provider(provider: Arc) -> Self { Self { - accessor: Arc::new(StorageOptionsAccessor::with_provider( - provider, - refresh_offset, - )), + accessor: Arc::new(StorageOptionsAccessor::with_provider(provider)), } } /// Create a new credential provider with initial credentials /// /// This is a convenience constructor for backward compatibility. + /// The refresh offset will be extracted from initial_options using + /// the `refresh_offset_millis` key, defaulting to 60 seconds. /// /// # Arguments /// * `provider` - The storage options provider - /// * `refresh_offset` - Duration before expiry to refresh credentials /// * `initial_options` - Initial storage options to cache pub fn from_provider_with_initial( provider: Arc, - refresh_offset: Duration, initial_options: HashMap, ) -> Self { Self { accessor: Arc::new(StorageOptionsAccessor::with_initial_and_provider( initial_options, provider, - refresh_offset, )), } } diff --git a/rust/lance-io/src/object_store/storage_options.rs b/rust/lance-io/src/object_store/storage_options.rs index 22854e8fd53..2e630279a73 100644 --- a/rust/lance-io/src/object_store/storage_options.rs +++ b/rust/lance-io/src/object_store/storage_options.rs @@ -20,6 +20,9 @@ use snafu::location; /// Key for the expiration timestamp in storage options HashMap pub const EXPIRES_AT_MILLIS_KEY: &str = "expires_at_millis"; +/// Key for the refresh offset in storage options HashMap (milliseconds before expiry to refresh) +pub const REFRESH_OFFSET_MILLIS_KEY: &str = "refresh_offset_millis"; + /// Trait for providing storage options with expiration tracking /// /// Implementations can fetch storage options from various sources (namespace servers, diff --git a/rust/lance-io/src/object_store/storage_options_accessor.rs b/rust/lance-io/src/object_store/storage_options_accessor.rs index 0a4748c389a..7e282f9f589 100644 --- a/rust/lance-io/src/object_store/storage_options_accessor.rs +++ b/rust/lance-io/src/object_store/storage_options_accessor.rs @@ -20,10 +20,13 @@ use std::time::{SystemTime, UNIX_EPOCH}; use tokio::sync::RwLock; use super::storage_options::StorageOptionsProvider; -use super::{StorageOptions, EXPIRES_AT_MILLIS_KEY}; +use super::{StorageOptions, EXPIRES_AT_MILLIS_KEY, REFRESH_OFFSET_MILLIS_KEY}; use lance_core::{Error, Result}; use snafu::location; +/// Default refresh offset: 60 seconds before expiration +const DEFAULT_REFRESH_OFFSET_MILLIS: u64 = 60_000; + /// Unified access to storage options with automatic caching and refresh /// /// This struct bundles static storage options with an optional dynamic provider, @@ -72,6 +75,15 @@ struct CachedStorageOptions { } impl StorageOptionsAccessor { + /// Extract refresh offset from storage options, or use default + fn extract_refresh_offset(options: &HashMap) -> Duration { + options + .get(REFRESH_OFFSET_MILLIS_KEY) + .and_then(|s| s.parse::().ok()) + .map(Duration::from_millis) + .unwrap_or(Duration::from_millis(DEFAULT_REFRESH_OFFSET_MILLIS)) + } + /// Create an accessor with only static options (no refresh capability) /// /// The returned accessor will always return the provided options. @@ -80,6 +92,7 @@ impl StorageOptionsAccessor { let expires_at_millis = options .get(EXPIRES_AT_MILLIS_KEY) .and_then(|s| s.parse::().ok()); + let refresh_offset = Self::extract_refresh_offset(&options); Self { initial_options: Some(options.clone()), @@ -88,7 +101,7 @@ impl StorageOptionsAccessor { options, expires_at_millis, }))), - refresh_offset: Duration::ZERO, + refresh_offset, } } @@ -96,19 +109,16 @@ impl StorageOptionsAccessor { /// /// The accessor will fetch from the provider on first access and cache /// the results. Refresh happens automatically before expiration. + /// Uses the default refresh offset (60 seconds) until options are fetched. /// /// # Arguments /// * `provider` - The storage options provider for fetching fresh options - /// * `refresh_offset` - Duration before expiry to trigger refresh - pub fn with_provider( - provider: Arc, - refresh_offset: Duration, - ) -> Self { + pub fn with_provider(provider: Arc) -> Self { Self { initial_options: None, provider: Some(provider), cache: Arc::new(RwLock::new(None)), - refresh_offset, + refresh_offset: Duration::from_millis(DEFAULT_REFRESH_OFFSET_MILLIS), } } @@ -116,19 +126,19 @@ impl StorageOptionsAccessor { /// /// Initial options are used until they expire, then the provider is called. /// This avoids an immediate fetch when initial credentials are still valid. + /// The `refresh_offset_millis` key in initial_options controls refresh timing. /// /// # Arguments /// * `initial_options` - Initial storage options to cache /// * `provider` - The storage options provider for refreshing - /// * `refresh_offset` - Duration before expiry to trigger refresh pub fn with_initial_and_provider( initial_options: HashMap, provider: Arc, - refresh_offset: Duration, ) -> Self { let expires_at_millis = initial_options .get(EXPIRES_AT_MILLIS_KEY) .and_then(|s| s.parse::().ok()); + let refresh_offset = Self::extract_refresh_offset(&initial_options); Self { initial_options: Some(initial_options.clone()), @@ -411,8 +421,7 @@ mod tests { MockClock::set_system_time(Duration::from_secs(100_000)); let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); - let accessor = - StorageOptionsAccessor::with_provider(mock_provider.clone(), Duration::from_secs(60)); + let accessor = StorageOptionsAccessor::with_provider(mock_provider.clone()); let result = accessor.get_storage_options().await.unwrap(); assert!(result.0.contains_key("aws_access_key_id")); @@ -442,7 +451,6 @@ mod tests { let accessor = StorageOptionsAccessor::with_initial_and_provider( initial.clone(), mock_provider.clone(), - Duration::from_secs(60), ); // First call uses initial @@ -456,26 +464,26 @@ mod tests { MockClock::set_system_time(Duration::from_secs(100_000)); let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); // 10 min expiry - let accessor = StorageOptionsAccessor::with_provider( - mock_provider.clone(), - Duration::from_secs(300), // 5 min refresh offset - ); - - // First call fetches - let result = accessor.get_storage_options().await.unwrap(); - assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); - assert_eq!(mock_provider.get_call_count().await, 1); + // Use with_initial_and_provider to set custom refresh_offset_millis (5 min = 300000ms) + let now_ms = MockClock::system_time().as_millis() as u64; + let expires_at = now_ms + 600_000; // 10 minutes from now + let initial = HashMap::from([ + (EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()), + (REFRESH_OFFSET_MILLIS_KEY.to_string(), "300000".to_string()), // 5 min refresh offset + ]); + let accessor = + StorageOptionsAccessor::with_initial_and_provider(initial, mock_provider.clone()); - // Second call uses cache + // First call uses initial cached options let result = accessor.get_storage_options().await.unwrap(); - assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); - assert_eq!(mock_provider.get_call_count().await, 1); + assert!(result.0.contains_key(EXPIRES_AT_MILLIS_KEY)); + assert_eq!(mock_provider.get_call_count().await, 0); // Advance time to 6 minutes - should trigger refresh (within 5 min refresh offset) MockClock::set_system_time(Duration::from_secs(100_000 + 360)); let result = accessor.get_storage_options().await.unwrap(); - assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_2"); - assert_eq!(mock_provider.get_call_count().await, 2); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + assert_eq!(mock_provider.get_call_count().await, 1); } #[tokio::test] @@ -491,11 +499,8 @@ mod tests { ]); let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); - let accessor = StorageOptionsAccessor::with_initial_and_provider( - initial, - mock_provider.clone(), - Duration::from_secs(60), - ); + let accessor = + StorageOptionsAccessor::with_initial_and_provider(initial, mock_provider.clone()); // Should fetch from provider since initial is expired let result = accessor.get_storage_options().await.unwrap(); @@ -506,8 +511,7 @@ mod tests { #[tokio::test] async fn test_accessor_id_with_provider() { let mock_provider = Arc::new(MockStorageOptionsProvider::new(None)); - let accessor = - StorageOptionsAccessor::with_provider(mock_provider, Duration::from_secs(60)); + let accessor = StorageOptionsAccessor::with_provider(mock_provider); let id = accessor.accessor_id(); assert!(id.starts_with("MockStorageOptionsProvider")); @@ -527,10 +531,7 @@ mod tests { // Create a mock provider with far future expiration let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(9999999999999))); - let accessor = Arc::new(StorageOptionsAccessor::with_provider( - mock_provider.clone(), - Duration::from_secs(300), - )); + let accessor = Arc::new(StorageOptionsAccessor::with_provider(mock_provider.clone())); // Spawn 10 concurrent tasks that all try to get options at the same time let mut handles = vec![]; @@ -567,8 +568,7 @@ mod tests { MockClock::set_system_time(Duration::from_secs(100_000)); let mock_provider = Arc::new(MockStorageOptionsProvider::new(None)); // No expiration - let accessor = - StorageOptionsAccessor::with_provider(mock_provider.clone(), Duration::from_secs(60)); + let accessor = StorageOptionsAccessor::with_provider(mock_provider.clone()); // First call fetches accessor.get_storage_options().await.unwrap(); diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index 4d6a88419ee..0722347d684 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -3106,15 +3106,10 @@ mod tests { .unwrap(); let reader1 = RecordBatchIterator::new(vec![data1].into_iter().map(Ok), schema.clone()); - let dataset = Dataset::write_into_namespace( - reader1, - namespace.clone(), - table_id.clone(), - None, - false, - ) - .await - .unwrap(); + let dataset = + Dataset::write_into_namespace(reader1, namespace.clone(), table_id.clone(), None) + .await + .unwrap(); assert_eq!(dataset.count_rows(None).await.unwrap(), 3); assert_eq!(dataset.version().version, 1); @@ -3140,7 +3135,6 @@ mod tests { namespace.clone(), table_id.clone(), Some(params_append), - false, ) .await .unwrap(); @@ -3169,7 +3163,6 @@ mod tests { namespace.clone(), table_id.clone(), Some(params_overwrite), - false, ) .await .unwrap(); diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index c005f61639a..f7df62d1c37 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -809,15 +809,12 @@ impl Dataset { /// * `namespace` - The namespace to use for table management /// * `table_id` - The table identifier /// * `params` - Write parameters - /// * `ignore_namespace_table_storage_options` - If true, ignore storage options returned - /// by the namespace and only use the storage options in params. The storage options - /// provider will not be created, so credentials will not be automatically refreshed. + #[allow(deprecated)] pub async fn write_into_namespace( batches: impl RecordBatchReader + Send + 'static, namespace: Arc, table_id: Vec, mut params: Option, - ignore_namespace_table_storage_options: bool, ) -> Result { let mut write_params = params.take().unwrap_or_default(); @@ -866,28 +863,26 @@ impl Dataset { location: location!(), })?; - // Set initial credentials and provider unless ignored - if !ignore_namespace_table_storage_options { - if let Some(namespace_storage_options) = response.storage_options { - let provider = Arc::new(LanceNamespaceStorageOptionsProvider::new( - namespace, table_id, - )); - - // Merge namespace storage options with any existing options - let mut merged_options = write_params - .store_params - .as_ref() - .and_then(|p| p.storage_options.clone()) - .unwrap_or_default(); - merged_options.extend(namespace_storage_options); - - let existing_params = write_params.store_params.take().unwrap_or_default(); - write_params.store_params = Some(ObjectStoreParams { - storage_options: Some(merged_options), - storage_options_provider: Some(provider), - ..existing_params - }); - } + // Set initial credentials and provider from namespace + if let Some(namespace_storage_options) = response.storage_options { + let provider = Arc::new(LanceNamespaceStorageOptionsProvider::new( + namespace, table_id, + )); + + // Merge namespace storage options with any existing options + let mut merged_options = write_params + .store_params + .as_ref() + .and_then(|p| p.storage_options.clone()) + .unwrap_or_default(); + merged_options.extend(namespace_storage_options); + + let existing_params = write_params.store_params.take().unwrap_or_default(); + write_params.store_params = Some(ObjectStoreParams { + storage_options: Some(merged_options), + storage_options_provider: Some(provider), + ..existing_params + }); } Self::write(batches, uri.as_str(), Some(write_params)).await @@ -913,29 +908,27 @@ impl Dataset { location: location!(), })?; - // Set initial credentials and provider unless ignored - if !ignore_namespace_table_storage_options { - if let Some(namespace_storage_options) = response.storage_options { - let provider = Arc::new(LanceNamespaceStorageOptionsProvider::new( - namespace.clone(), - table_id.clone(), - )); - - // Merge namespace storage options with any existing options - let mut merged_options = write_params - .store_params - .as_ref() - .and_then(|p| p.storage_options.clone()) - .unwrap_or_default(); - merged_options.extend(namespace_storage_options); - - let existing_params = write_params.store_params.take().unwrap_or_default(); - write_params.store_params = Some(ObjectStoreParams { - storage_options: Some(merged_options), - storage_options_provider: Some(provider), - ..existing_params - }); - } + // Set initial credentials and provider from namespace + if let Some(namespace_storage_options) = response.storage_options { + let provider = Arc::new(LanceNamespaceStorageOptionsProvider::new( + namespace.clone(), + table_id.clone(), + )); + + // Merge namespace storage options with any existing options + let mut merged_options = write_params + .store_params + .as_ref() + .and_then(|p| p.storage_options.clone()) + .unwrap_or_default(); + merged_options.extend(namespace_storage_options); + + let existing_params = write_params.store_params.take().unwrap_or_default(); + write_params.store_params = Some(ObjectStoreParams { + storage_options: Some(merged_options), + storage_options_provider: Some(provider), + ..existing_params + }); } // For APPEND/OVERWRITE modes, we must open the existing dataset first diff --git a/rust/lance/src/dataset/builder.rs b/rust/lance/src/dataset/builder.rs index 89d290604a4..60323a9a2ff 100644 --- a/rust/lance/src/dataset/builder.rs +++ b/rust/lance/src/dataset/builder.rs @@ -95,8 +95,6 @@ impl DatasetBuilder { /// # Arguments /// * `namespace` - The namespace implementation to fetch table info from /// * `table_id` - The table identifier (e.g., vec!["my_table"]) - /// * `ignore_namespace_table_storage_options` - If true, storage options returned from - /// the namespace's `describe_table()` will be ignored (treated as None). Defaults to false. /// /// # Example /// ```ignore @@ -111,28 +109,17 @@ impl DatasetBuilder { /// /// // Load a dataset using storage options from namespace /// let dataset = DatasetBuilder::from_namespace( - /// namespace.clone(), - /// vec!["my_table".to_string()], - /// false, - /// ) - /// .await? - /// .load() - /// .await?; - /// - /// // Load a dataset ignoring namespace storage options - /// let dataset = DatasetBuilder::from_namespace( /// namespace, /// vec!["my_table".to_string()], - /// true, /// ) /// .await? /// .load() /// .await?; /// ``` + #[allow(deprecated)] pub async fn from_namespace( namespace: Arc, table_id: Vec, - ignore_namespace_table_storage_options: bool, ) -> Result { let request = DescribeTableRequest { id: Some(table_id.clone()), @@ -156,11 +143,8 @@ impl DatasetBuilder { let mut builder = Self::from_uri(table_uri); - let namespace_storage_options = if ignore_namespace_table_storage_options { - None - } else { - response.storage_options - }; + // Use namespace storage options if available + let namespace_storage_options = response.storage_options; builder.storage_options_override = namespace_storage_options.clone(); From b65e193cf5fd9bc7fd412eefe7bf6899f5595213 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 15 Jan 2026 23:00:40 -0800 Subject: [PATCH 3/9] fix clippy --- .../src/object_store/providers/aws.rs | 53 ++++++++----------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/rust/lance-io/src/object_store/providers/aws.rs b/rust/lance-io/src/object_store/providers/aws.rs index 6ec643c5e8e..4e7ce43d9f2 100644 --- a/rust/lance-io/src/object_store/providers/aws.rs +++ b/rust/lance-io/src/object_store/providers/aws.rs @@ -760,11 +760,11 @@ mod tests { ), ("aws_session_token".to_string(), "TOKEN_CACHED".to_string()), ("expires_at_millis".to_string(), expires_at.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset ]); let provider = DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset initial_options, ); @@ -798,11 +798,11 @@ mod tests { "SECRET_EXPIRED".to_string(), ), ("expires_at_millis".to_string(), expired_time.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset ]); let provider = DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset initial_options, ); @@ -820,27 +820,24 @@ mod tests { async fn test_dynamic_credential_provider_refresh_lead_time() { MockClock::set_system_time(Duration::from_secs(100_000)); - // Create a mock provider that returns credentials expiring in 4 minutes + // Create a mock provider that returns credentials expiring in 30 seconds let mock = Arc::new(MockStorageOptionsProvider::new(Some( - 240_000, // Expires in 4 minutes + 30_000, // Expires in 30 seconds ))); - // Create credential provider with 5 minute refresh offset - // This means credentials should be refreshed when they have less than 5 minutes left - let provider = DynamicStorageOptionsCredentialProvider::from_provider( - mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset - ); + // Create credential provider with default 60 second refresh offset + // This means credentials should be refreshed when they have less than 60 seconds left + let provider = DynamicStorageOptionsCredentialProvider::from_provider(mock.clone()); // First call should fetch credentials from provider (no initial cache) - // Credentials expire in 4 minutes, which is less than our 5 minute refresh offset, + // Credentials expire in 30 seconds, which is less than our 60 second refresh offset, // so they should be considered "needs refresh" immediately let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_1"); assert_eq!(mock.get_call_count().await, 1); - // Second call should trigger refresh because credentials expire in 4 minutes - // but our refresh lead time is 5 minutes (now + 5min > expires_at) + // Second call should trigger refresh because credentials expire in 30 seconds + // but our refresh lead time is 60 seconds (now + 60sec > expires_at) // The mock will return new credentials (AKID_2) with the same expiration let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_2"); @@ -851,16 +848,13 @@ mod tests { async fn test_dynamic_credential_provider_no_initial_cache() { MockClock::set_system_time(Duration::from_secs(100_000)); - // Create a mock provider that returns credentials expiring in 10 minutes + // Create a mock provider that returns credentials expiring in 2 minutes let mock = Arc::new(MockStorageOptionsProvider::new(Some( - 600_000, // Expires in 10 minutes + 120_000, // Expires in 2 minutes ))); - // Create credential provider without initial cache - let provider = DynamicStorageOptionsCredentialProvider::from_provider( - mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset - ); + // Create credential provider without initial cache, using default 60 second refresh offset + let provider = DynamicStorageOptionsCredentialProvider::from_provider(mock.clone()); // First call should fetch from provider (call count = 1) let cred = provider.get_credential().await.unwrap(); @@ -869,21 +863,22 @@ mod tests { assert_eq!(cred.token, Some("TOKEN_1".to_string())); assert_eq!(mock.get_call_count().await, 1); - // Second call should use cached credentials (not expired yet) + // Second call should use cached credentials (not expired yet, still > 60 seconds remaining) let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_1"); assert_eq!(mock.get_call_count().await, 1); // Still 1, didn't fetch again - // Advance time to 6 minutes - should trigger refresh (within 5 min refresh offset) - MockClock::set_system_time(Duration::from_secs(100_000 + 360)); + // Advance time to 90 seconds - should trigger refresh (within 60 sec refresh offset) + // At this point, credentials expire in 30 seconds (< 60 sec offset) + MockClock::set_system_time(Duration::from_secs(100_000 + 90)); let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_2"); assert_eq!(cred.secret_key, "SECRET_2"); assert_eq!(cred.token, Some("TOKEN_2".to_string())); assert_eq!(mock.get_call_count().await, 2); - // Advance time to 11 minutes total - should trigger another refresh - MockClock::set_system_time(Duration::from_secs(100_000 + 660)); + // Advance time to 210 seconds total (90 + 120) - should trigger another refresh + MockClock::set_system_time(Duration::from_secs(100_000 + 210)); let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_3"); assert_eq!(cred.secret_key, "SECRET_3"); @@ -911,12 +906,12 @@ mod tests { ), ("aws_session_token".to_string(), "TOKEN_INITIAL".to_string()), ("expires_at_millis".to_string(), expires_at.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset ]); // Create credential provider with initial options let provider = DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset initial_options, ); @@ -968,7 +963,6 @@ mod tests { let provider = Arc::new(DynamicStorageOptionsCredentialProvider::from_provider( mock.clone(), - Duration::from_secs(300), )); // Spawn 10 concurrent tasks that all try to get credentials at the same time @@ -1024,6 +1018,7 @@ mod tests { ), ("aws_session_token".to_string(), "TOKEN_OLD".to_string()), ("expires_at_millis".to_string(), expires_at.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset ]); // Mock will return credentials expiring in 1 hour @@ -1034,7 +1029,6 @@ mod tests { let provider = Arc::new( DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), - Duration::from_secs(300), initial_options, ), ); @@ -1090,7 +1084,6 @@ mod tests { // Create an accessor with the mock provider let accessor = Arc::new(StorageOptionsAccessor::with_provider( mock_storage_provider.clone(), - Duration::from_secs(300), )); // Create an explicit AWS credentials provider @@ -1151,13 +1144,13 @@ mod tests { "TOKEN_FROM_ACCESSOR".to_string(), ), ("expires_at_millis".to_string(), expires_at.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset ]); // Create an accessor with initial options and provider let accessor = Arc::new(StorageOptionsAccessor::with_initial_and_provider( initial_options, mock_storage_provider.clone(), - Duration::from_secs(300), )); // Build credentials with accessor but NO explicit aws_credentials From 91fb13cd5138187e67fbe7f52ded6573701753f1 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 15 Jan 2026 23:04:13 -0800 Subject: [PATCH 4/9] fix integ --- java/src/test/java/org/lance/NamespaceIntegrationTest.java | 2 ++ python/python/tests/test_namespace_integration.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/java/src/test/java/org/lance/NamespaceIntegrationTest.java b/java/src/test/java/org/lance/NamespaceIntegrationTest.java index 85e86ff5d09..2d6f8ab1443 100644 --- a/java/src/test/java/org/lance/NamespaceIntegrationTest.java +++ b/java/src/test/java/org/lance/NamespaceIntegrationTest.java @@ -203,6 +203,8 @@ private Map modifyStorageOptions( long expiresAtMillis = System.currentTimeMillis() + (credentialExpiresInSeconds * 1000L); modified.put("expires_at_millis", String.valueOf(expiresAtMillis)); + // Set refresh offset to 1 second (1000ms) for short-lived credential tests + modified.put("refresh_offset_millis", "1000"); return modified; } diff --git a/python/python/tests/test_namespace_integration.py b/python/python/tests/test_namespace_integration.py index 01a3b6859e9..30489496e38 100644 --- a/python/python/tests/test_namespace_integration.py +++ b/python/python/tests/test_namespace_integration.py @@ -128,6 +128,8 @@ def _modify_storage_options( (time.time() + self.credential_expires_in_seconds) * 1000 ) modified["expires_at_millis"] = str(expires_at_millis) + # Set refresh offset to 1 second (1000ms) for short-lived credential tests + modified["refresh_offset_millis"] = "1000" return modified From d001b90519538067be8296ec7f925a18bd99fd9f Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 15 Jan 2026 23:08:14 -0800 Subject: [PATCH 5/9] fix --- rust/lance-namespace-impls/src/rest_adapter.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/rust/lance-namespace-impls/src/rest_adapter.rs b/rust/lance-namespace-impls/src/rest_adapter.rs index 899863793ff..ad669094fb1 100644 --- a/rust/lance-namespace-impls/src/rest_adapter.rs +++ b/rust/lance-namespace-impls/src/rest_adapter.rs @@ -2700,7 +2700,6 @@ mod tests { namespace.clone(), table_id.clone(), None, - false, ) .await .unwrap(); @@ -2729,7 +2728,6 @@ mod tests { namespace.clone(), table_id.clone(), Some(params_append), - false, ) .await .unwrap(); @@ -2758,7 +2756,6 @@ mod tests { namespace.clone(), table_id.clone(), Some(params_overwrite), - false, ) .await .unwrap(); From 4e4dd5d9042512dd0398ce34def2cb1c31fe296d Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 15 Jan 2026 23:53:50 -0800 Subject: [PATCH 6/9] make accessor private --- java/lance-jni/src/blocking_dataset.rs | 29 ++----------------- java/lance-jni/src/transaction.rs | 11 ++++--- java/lance-jni/src/utils.rs | 10 +++---- rust/lance-io/src/object_store.rs | 9 +++--- .../lance-namespace-impls/src/rest_adapter.rs | 12 +++----- rust/lance/src/dataset.rs | 5 ---- rust/lance/src/dataset/builder.rs | 5 ---- 7 files changed, 20 insertions(+), 61 deletions(-) diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index 28fe3f0ae5c..8d8b468f6cc 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -76,13 +76,6 @@ pub struct BlockingDataset { } impl BlockingDataset { - /// Get the storage options provider that was used when opening this dataset - #[deprecated(note = "Use storage_options_accessor() instead")] - pub fn get_storage_options_provider(&self) -> Option> { - #[allow(deprecated)] - self.inner.storage_options_provider() - } - /// Get the initial storage options used to open this dataset. /// /// Returns the options that were provided when the dataset was opened, @@ -102,16 +95,6 @@ impl BlockingDataset { .map_err(|e| Error::io_error(e.to_string())) } - /// Get the storage options accessor for this dataset. - /// - /// The accessor bundles static storage options and optional dynamic provider, - /// handling caching and refresh logic internally. - pub fn storage_options_accessor( - &self, - ) -> Option> { - self.inner.storage_options_accessor() - } - pub fn drop(uri: &str, storage_options: HashMap) -> Result<()> { RT.block_on(async move { let registry = Arc::new(ObjectStoreRegistry::default()); @@ -149,14 +132,12 @@ impl BlockingDataset { serialized_manifest: Option<&[u8]>, storage_options_provider: Option>, ) -> Result { - let mut store_params = ObjectStoreParams { + let store_params = ObjectStoreParams { block_size: block_size.map(|size| size as usize), - storage_options: Some(storage_options.clone()), + storage_options: Some(storage_options), + storage_options_provider, ..Default::default() }; - if let Some(provider) = storage_options_provider.clone() { - store_params.storage_options_provider = Some(provider); - } let params = ReadParams { index_cache_size_bytes: index_cache_size_bytes as usize, metadata_cache_size_bytes: metadata_cache_size_bytes as usize, @@ -169,10 +150,6 @@ impl BlockingDataset { if let Some(ver) = version { builder = builder.with_version(ver as u64); } - builder = builder.with_storage_options(storage_options); - if let Some(provider) = storage_options_provider.clone() { - builder = builder.with_storage_options_provider(provider) - } if let Some(serialized_manifest) = serialized_manifest { builder = builder.with_serialized_manifest(serialized_manifest)?; diff --git a/java/lance-jni/src/transaction.rs b/java/lance-jni/src/transaction.rs index 923de6736f0..b8f63afb996 100644 --- a/java/lance-jni/src/transaction.rs +++ b/java/lance-jni/src/transaction.rs @@ -756,18 +756,17 @@ fn inner_commit_transaction<'local>( let write_param_jmap = JMap::from_env(env, &write_param_jobj)?; let write_param = to_rust_map(env, &write_param_jmap)?; - // Get the Dataset's storage_options_provider - let storage_options_provider = { + // Get the Dataset's storage_options_accessor + let storage_options_accessor = { let dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(&java_dataset, NATIVE_DATASET) }?; - #[allow(deprecated)] - dataset_guard.get_storage_options_provider() + dataset_guard.inner.storage_options_accessor() }; - // Build ObjectStoreParams using write_param for storage_options and provider from Dataset + // Build ObjectStoreParams using write_param for storage_options and accessor from Dataset let store_params = ObjectStoreParams { storage_options: Some(write_param), - storage_options_provider, + storage_options_accessor, ..Default::default() }; diff --git a/java/lance-jni/src/utils.rs b/java/lance-jni/src/utils.rs index 567182c39a1..0efd077e052 100644 --- a/java/lance-jni/src/utils.rs +++ b/java/lance-jni/src/utils.rs @@ -89,13 +89,11 @@ pub fn extract_write_params( extract_storage_options(env, storage_options_obj)?; // Extract storage options provider if present - let storage_options_provider = env + let storage_options_provider: Option> = env .get_optional(storage_options_provider_obj, |env, provider_obj| { JavaStorageOptionsProvider::new(env, provider_obj) - })?; - - let storage_options_provider_arc: Option> = - storage_options_provider.map(|v| Arc::new(v) as Arc); + })? + .map(|p| Arc::new(p) as Arc); if let Some(initial_bases) = env.get_list_opt(initial_bases, |env, elem| elem.extract_object(env))? @@ -109,7 +107,7 @@ pub fn extract_write_params( write_params.store_params = Some(ObjectStoreParams { storage_options: Some(storage_options), - storage_options_provider: storage_options_provider_arc, + storage_options_provider, ..Default::default() }); Ok(write_params) diff --git a/rust/lance-io/src/object_store.rs b/rust/lance-io/src/object_store.rs index 9ec16462486..bc6060ba4fa 100644 --- a/rust/lance-io/src/object_store.rs +++ b/rust/lance-io/src/object_store.rs @@ -197,11 +197,10 @@ pub struct ObjectStoreParams { pub aws_credentials: Option, pub object_store_wrapper: Option>, pub storage_options: Option>, - /// Dynamic storage options provider for automatic credential refresh - #[deprecated( - since = "0.25.0", - note = "Use storage_options_accessor instead for unified access to storage options" - )] + /// Dynamic storage options provider for automatic credential refresh. + /// + /// When set, the provider will be called to refresh storage options when they expire. + /// The `expires_at_millis` key in the returned options controls when refresh occurs. pub storage_options_provider: Option>, /// Unified storage options accessor with caching and automatic refresh /// diff --git a/rust/lance-namespace-impls/src/rest_adapter.rs b/rust/lance-namespace-impls/src/rest_adapter.rs index ad669094fb1..b63331c8a66 100644 --- a/rust/lance-namespace-impls/src/rest_adapter.rs +++ b/rust/lance-namespace-impls/src/rest_adapter.rs @@ -2695,14 +2695,10 @@ mod tests { .unwrap(); let reader1 = RecordBatchIterator::new(vec![data1].into_iter().map(Ok), schema.clone()); - let dataset = Dataset::write_into_namespace( - reader1, - namespace.clone(), - table_id.clone(), - None, - ) - .await - .unwrap(); + let dataset = + Dataset::write_into_namespace(reader1, namespace.clone(), table_id.clone(), None) + .await + .unwrap(); assert_eq!(dataset.count_rows(None).await.unwrap(), 3); assert_eq!(dataset.version().version, 1); diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index f7df62d1c37..177b3284b69 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -1617,11 +1617,6 @@ impl Dataset { } /// Returns the storage options provider used when opening this dataset, if any. - #[deprecated( - since = "0.25.0", - note = "Use storage_options_accessor() instead for unified access to storage options" - )] - #[allow(deprecated)] pub fn storage_options_provider( &self, ) -> Option> { diff --git a/rust/lance/src/dataset/builder.rs b/rust/lance/src/dataset/builder.rs index 60323a9a2ff..e3eb8efe348 100644 --- a/rust/lance/src/dataset/builder.rs +++ b/rust/lance/src/dataset/builder.rs @@ -335,11 +335,6 @@ impl DatasetBuilder { /// .with_s3_credentials_refresh_offset(Duration::from_secs(600)) /// .load() /// .await?; - #[deprecated( - since = "0.25.0", - note = "Use with_storage_options_accessor() instead for unified access to storage options" - )] - #[allow(deprecated)] pub fn with_storage_options_provider( mut self, provider: Arc, From 5840cb1a27d54f617986a9a9e75b37e58b50c48f Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Fri, 16 Jan 2026 00:36:44 -0800 Subject: [PATCH 7/9] refactor2 --- java/lance-jni/src/blocking_dataset.rs | 52 +- java/lance-jni/src/file_reader.rs | 4 +- java/lance-jni/src/file_writer.rs | 4 +- java/lance-jni/src/transaction.rs | 38 +- java/lance-jni/src/utils.rs | 17 +- java/src/main/java/org/lance/Dataset.java | 6 +- python/python/lance/dataset.py | 4 +- python/src/dataset.rs | 53 +- python/src/file.rs | 20 +- rust/lance-io/src/object_store.rs | 84 +-- rust/lance-io/src/object_store/providers.rs | 7 +- .../src/object_store/providers/aws.rs | 27 +- .../src/object_store/providers/azure.rs | 31 +- .../src/object_store/providers/gcp.rs | 21 +- .../src/object_store/providers/huggingface.rs | 16 +- .../src/object_store/providers/local.rs | 4 +- .../src/object_store/providers/memory.rs | 4 +- .../src/object_store/providers/oss.rs | 5 +- .../src/object_store/storage_options.rs | 579 ++++++++++++++++- .../object_store/storage_options_accessor.rs | 584 ------------------ rust/lance-namespace-impls/src/dir.rs | 9 +- .../lance-namespace-impls/src/dir/manifest.rs | 6 +- rust/lance-table/src/io/commit.rs | 6 +- rust/lance/src/dataset.rs | 65 +- rust/lance/src/dataset/builder.rs | 90 ++- rust/lance/src/dataset/fragment/write.rs | 8 +- rust/lance/src/io.rs | 5 +- 27 files changed, 950 insertions(+), 799 deletions(-) delete mode 100644 rust/lance-io/src/object_store/storage_options_accessor.rs diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index 8d8b468f6cc..acea6166905 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -81,8 +81,8 @@ impl BlockingDataset { /// Returns the options that were provided when the dataset was opened, /// without any refresh from the provider. Returns None if no storage options /// were provided. - pub fn storage_options(&self) -> Option> { - self.inner.storage_options().cloned() + pub fn initial_storage_options(&self) -> Option> { + self.inner.initial_storage_options().cloned() } /// Get the latest storage options, potentially refreshed from the provider. @@ -99,7 +99,9 @@ impl BlockingDataset { RT.block_on(async move { let registry = Arc::new(ObjectStoreRegistry::default()); let object_store_params = ObjectStoreParams { - storage_options: Some(storage_options.clone()), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::static_options(storage_options), + )), ..Default::default() }; let (object_store, path) = @@ -132,10 +134,26 @@ impl BlockingDataset { serialized_manifest: Option<&[u8]>, storage_options_provider: Option>, ) -> Result { + // Create storage options accessor from storage_options and provider + let accessor = match (storage_options.is_empty(), storage_options_provider) { + (false, Some(provider)) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_initial_and_provider( + storage_options, + provider, + ), + )), + (false, None) => Some(Arc::new(lance::io::StorageOptionsAccessor::static_options( + storage_options, + ))), + (true, Some(provider)) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_provider(provider), + )), + (true, None) => None, + }; + let store_params = ObjectStoreParams { block_size: block_size.map(|size| size as usize), - storage_options: Some(storage_options), - storage_options_provider, + storage_options_accessor: accessor, ..Default::default() }; let params = ReadParams { @@ -165,12 +183,19 @@ impl BlockingDataset { read_version: Option, storage_options: HashMap, ) -> Result { + let accessor = if storage_options.is_empty() { + None + } else { + Some(Arc::new(lance::io::StorageOptionsAccessor::static_options( + storage_options, + ))) + }; let inner = RT.block_on(Dataset::commit( uri, operation, read_version, Some(ObjectStoreParams { - storage_options: Some(storage_options), + storage_options_accessor: accessor, ..Default::default() }), None, @@ -1298,21 +1323,24 @@ fn inner_latest_version_id(env: &mut JNIEnv, java_dataset: JObject) -> Result( +pub extern "system" fn Java_org_lance_Dataset_nativeGetInitialStorageOptions<'local>( mut env: JNIEnv<'local>, java_dataset: JObject, ) -> JObject<'local> { - ok_or_throw!(env, inner_get_storage_options(&mut env, java_dataset)) + ok_or_throw!( + env, + inner_get_initial_storage_options(&mut env, java_dataset) + ) } -fn inner_get_storage_options<'local>( +fn inner_get_initial_storage_options<'local>( env: &mut JNIEnv<'local>, java_dataset: JObject, ) -> Result> { let storage_options = { let dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; - dataset_guard.storage_options() + dataset_guard.initial_storage_options() }; match storage_options { Some(opts) => opts.into_java(env), @@ -2205,7 +2233,9 @@ fn transform_jstorage_options( Ok(storage_options .map(|options| { Some(ObjectStoreParams { - storage_options: Some(options), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::static_options(options), + )), ..Default::default() }) }) diff --git a/java/lance-jni/src/file_reader.rs b/java/lance-jni/src/file_reader.rs index 11591b3acea..83e1e5d45ab 100644 --- a/java/lance-jni/src/file_reader.rs +++ b/java/lance-jni/src/file_reader.rs @@ -112,7 +112,9 @@ fn inner_open<'local>( let storage_options = to_rust_map(env, &jmap)?; let reader = RT.block_on(async move { let object_params = ObjectStoreParams { - storage_options: Some(storage_options), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::static_options(storage_options), + )), ..Default::default() }; let (obj_store, path) = ObjectStore::from_uri_and_params( diff --git a/java/lance-jni/src/file_writer.rs b/java/lance-jni/src/file_writer.rs index 600d7de2845..3328ddc4d3c 100644 --- a/java/lance-jni/src/file_writer.rs +++ b/java/lance-jni/src/file_writer.rs @@ -94,7 +94,9 @@ fn inner_open<'local>( let writer = RT.block_on(async move { let object_params = ObjectStoreParams { - storage_options: Some(storage_options), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::static_options(storage_options), + )), ..Default::default() }; let (obj_store, path) = ObjectStore::from_uri_and_params( diff --git a/java/lance-jni/src/transaction.rs b/java/lance-jni/src/transaction.rs index b8f63afb996..e5cc16413c1 100644 --- a/java/lance-jni/src/transaction.rs +++ b/java/lance-jni/src/transaction.rs @@ -756,16 +756,46 @@ fn inner_commit_transaction<'local>( let write_param_jmap = JMap::from_env(env, &write_param_jobj)?; let write_param = to_rust_map(env, &write_param_jmap)?; - // Get the Dataset's storage_options_accessor + // Get the Dataset's storage_options_accessor and merge with write_param let storage_options_accessor = { let dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(&java_dataset, NATIVE_DATASET) }?; - dataset_guard.inner.storage_options_accessor() + let existing_accessor = dataset_guard.inner.storage_options_accessor(); + + // Merge write_param with existing accessor's initial options + match existing_accessor { + Some(accessor) => { + let mut merged = accessor + .initial_storage_options() + .cloned() + .unwrap_or_default(); + merged.extend(write_param); + if let Some(provider) = accessor.provider().cloned() { + Some(Arc::new( + lance::io::StorageOptionsAccessor::with_initial_and_provider( + merged, provider, + ), + )) + } else { + Some(Arc::new(lance::io::StorageOptionsAccessor::static_options( + merged, + ))) + } + } + None => { + if !write_param.is_empty() { + Some(Arc::new(lance::io::StorageOptionsAccessor::static_options( + write_param, + ))) + } else { + None + } + } + } }; - // Build ObjectStoreParams using write_param for storage_options and accessor from Dataset + // Build ObjectStoreParams using the merged accessor let store_params = ObjectStoreParams { - storage_options: Some(write_param), storage_options_accessor, ..Default::default() }; diff --git a/java/lance-jni/src/utils.rs b/java/lance-jni/src/utils.rs index 0efd077e052..d12a60286c5 100644 --- a/java/lance-jni/src/utils.rs +++ b/java/lance-jni/src/utils.rs @@ -105,9 +105,22 @@ pub fn extract_write_params( write_params.target_base_names_or_paths = Some(names); } + // Create storage options accessor from storage_options and provider + let accessor = match (storage_options.is_empty(), storage_options_provider) { + (false, Some(provider)) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_initial_and_provider(storage_options, provider), + )), + (false, None) => Some(Arc::new(lance::io::StorageOptionsAccessor::static_options( + storage_options, + ))), + (true, Some(provider)) => Some(Arc::new(lance::io::StorageOptionsAccessor::with_provider( + provider, + ))), + (true, None) => None, + }; + write_params.store_params = Some(ObjectStoreParams { - storage_options: Some(storage_options), - storage_options_provider, + storage_options_accessor: accessor, ..Default::default() }); Ok(write_params) diff --git a/java/src/main/java/org/lance/Dataset.java b/java/src/main/java/org/lance/Dataset.java index 6b7e914ea35..596e9f83c91 100644 --- a/java/src/main/java/org/lance/Dataset.java +++ b/java/src/main/java/org/lance/Dataset.java @@ -759,14 +759,14 @@ public long latestVersion() { * * @return the initial storage options, or null if none were provided */ - public Map getStorageOptions() { + public Map getInitialStorageOptions() { try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) { Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed"); - return nativeGetStorageOptions(); + return nativeGetInitialStorageOptions(); } } - private native Map nativeGetStorageOptions(); + private native Map nativeGetInitialStorageOptions(); /** * Get the latest storage options, potentially refreshed from the provider. diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 4a7f75c64b6..3355b459ebf 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -2224,7 +2224,7 @@ def latest_version(self) -> int: return self._ds.latest_version() @property - def storage_options(self) -> Optional[Dict[str, str]]: + def initial_storage_options(self) -> Optional[Dict[str, str]]: """ Get the initial storage options used to open this dataset. @@ -2232,7 +2232,7 @@ def storage_options(self) -> Optional[Dict[str, str]]: without any refresh from the provider. Returns None if no storage options were provided. """ - return self._ds.storage_options() + return self._ds.initial_storage_options() def latest_storage_options(self) -> Optional[Dict[str, str]]: """ diff --git a/python/src/dataset.rs b/python/src/dataset.rs index e6a954c4439..7e2f682d876 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -1520,8 +1520,8 @@ impl Dataset { /// This returns the options that were provided when the dataset was opened, /// without any refresh from the provider. Returns None if no storage options /// were provided. - fn storage_options(&self) -> Option> { - self.ds.storage_options().cloned() + fn initial_storage_options(&self) -> Option> { + self.ds.initial_storage_options().cloned() } /// Get the latest storage options, potentially refreshed from the provider. @@ -1564,7 +1564,9 @@ impl Dataset { // `version` can be a version number or a tag name. // `storage_options` will be forwarded to the object store params for the new dataset. let store_params = storage_options.as_ref().map(|opts| ObjectStoreParams { - storage_options: Some(opts.clone()), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::static_options(opts.clone()), + )), ..Default::default() }); @@ -1743,7 +1745,9 @@ impl Dataset { let mut new_self = self.ds.as_ref().clone(); let reference = self.transform_ref(reference)?; let store_params = storage_options.map(|opts| ObjectStoreParams { - storage_options: Some(opts), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::static_options(opts), + )), ..Default::default() }); let created = rt() @@ -2263,14 +2267,14 @@ impl Dataset { detached: Option, max_retries: Option, ) -> PyResult { - let provider = storage_options_provider - .map(crate::storage_options::py_object_to_storage_options_provider) - .transpose()?; + let accessor = crate::storage_options::create_accessor_from_python( + storage_options.clone(), + storage_options_provider, + )?; - let object_store_params = if storage_options.is_some() || provider.is_some() { + let object_store_params = if accessor.is_some() { Some(ObjectStoreParams { - storage_options: storage_options.clone(), - storage_options_provider: provider, + storage_options_accessor: accessor, ..Default::default() }) } else { @@ -2327,14 +2331,14 @@ impl Dataset { detached: Option, max_retries: Option, ) -> PyResult<(Self, PyLance)> { - let provider = storage_options_provider - .map(crate::storage_options::py_object_to_storage_options_provider) - .transpose()?; + let accessor = crate::storage_options::create_accessor_from_python( + storage_options.clone(), + storage_options_provider, + )?; - let object_store_params = if storage_options.is_some() || provider.is_some() { + let object_store_params = if accessor.is_some() { Some(ObjectStoreParams { - storage_options: storage_options.clone(), - storage_options_provider: provider, + storage_options_accessor: accessor, ..Default::default() }) } else { @@ -3133,18 +3137,17 @@ pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult>(options, "storage_options")?; let storage_options_provider = - get_dict_opt::>(options, "storage_options_provider")? - .map(|py_obj| { - crate::storage_options::py_object_to_storage_options_provider( - py_obj.bind(options.py()), - ) - }) - .transpose()?; + get_dict_opt::>(options, "storage_options_provider")?; if storage_options.is_some() || storage_options_provider.is_some() { - p.store_params = Some(ObjectStoreParams { + let accessor = crate::storage_options::create_accessor_from_python( storage_options, - storage_options_provider, + storage_options_provider + .as_ref() + .map(|py_obj| py_obj.bind(options.py())), + )?; + p.store_params = Some(ObjectStoreParams { + storage_options_accessor: accessor, ..Default::default() }); } diff --git a/python/src/file.rs b/python/src/file.rs index 80c66d739d1..5c479e0e3a8 100644 --- a/python/src/file.rs +++ b/python/src/file.rs @@ -382,16 +382,30 @@ pub async fn object_store_from_uri_or_path( object_store_from_uri_or_path_with_provider(uri_or_path, storage_options, None).await } -#[allow(deprecated)] pub async fn object_store_from_uri_or_path_with_provider( uri_or_path: impl AsRef, storage_options: Option>, storage_options_provider: Option>, ) -> PyResult<(Arc, Path)> { let object_store_registry = Arc::new(lance::io::ObjectStoreRegistry::default()); + + let accessor = match (storage_options, storage_options_provider) { + (Some(opts), Some(provider)) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_initial_and_provider(opts, provider), + )), + (None, Some(provider)) => { + Some(Arc::new(lance::io::StorageOptionsAccessor::with_provider( + provider, + ))) + } + (Some(opts), None) => Some(Arc::new( + lance::io::StorageOptionsAccessor::static_options(opts), + )), + (None, None) => None, + }; + let object_store_params = ObjectStoreParams { - storage_options: storage_options.clone(), - storage_options_provider, + storage_options_accessor: accessor, ..Default::default() }; diff --git a/rust/lance-io/src/object_store.rs b/rust/lance-io/src/object_store.rs index bc6060ba4fa..9947781504f 100644 --- a/rust/lance-io/src/object_store.rs +++ b/rust/lance-io/src/object_store.rs @@ -35,7 +35,6 @@ use super::local::LocalObjectReader; mod list_retry; pub mod providers; pub mod storage_options; -pub mod storage_options_accessor; mod tracing; use crate::object_reader::SmallReader; use crate::object_writer::WriteResult; @@ -65,10 +64,9 @@ pub const DEFAULT_DOWNLOAD_RETRY_COUNT: usize = 3; pub use providers::{ObjectStoreProvider, ObjectStoreRegistry}; pub use storage_options::{ - LanceNamespaceStorageOptionsProvider, StorageOptionsProvider, EXPIRES_AT_MILLIS_KEY, - REFRESH_OFFSET_MILLIS_KEY, + LanceNamespaceStorageOptionsProvider, StorageOptionsAccessor, StorageOptionsProvider, + EXPIRES_AT_MILLIS_KEY, REFRESH_OFFSET_MILLIS_KEY, }; -pub use storage_options_accessor::StorageOptionsAccessor; #[async_trait] pub trait ObjectStoreExt { @@ -196,16 +194,11 @@ pub struct ObjectStoreParams { #[cfg(feature = "aws")] pub aws_credentials: Option, pub object_store_wrapper: Option>, - pub storage_options: Option>, - /// Dynamic storage options provider for automatic credential refresh. - /// - /// When set, the provider will be called to refresh storage options when they expire. - /// The `expires_at_millis` key in the returned options controls when refresh occurs. - pub storage_options_provider: Option>, /// Unified storage options accessor with caching and automatic refresh /// - /// This is the preferred way to provide storage options. It bundles static options - /// with an optional dynamic provider and handles all caching and refresh logic. + /// Provides storage options and optionally a dynamic provider for automatic + /// credential refresh. Use `StorageOptionsAccessor::static_options()` for static + /// options or `StorageOptionsAccessor::with_initial_and_provider()` for dynamic refresh. pub storage_options_accessor: Option>, /// Use constant size upload parts for multipart uploads. Only necessary /// for Cloudflare R2, which doesn't support variable size parts. When this @@ -225,8 +218,6 @@ impl Default for ObjectStoreParams { #[cfg(feature = "aws")] aws_credentials: None, object_store_wrapper: None, - storage_options: None, - storage_options_provider: None, storage_options_accessor: None, use_constant_size_upload_parts: false, list_is_lexically_ordered: None, @@ -235,30 +226,18 @@ impl Default for ObjectStoreParams { } impl ObjectStoreParams { - /// Get or create a StorageOptionsAccessor from the params - /// - /// If `storage_options_accessor` is already set, returns it. - /// Otherwise, creates an accessor from the legacy `storage_options` and - /// `storage_options_provider` fields. - #[allow(deprecated)] - pub fn get_or_create_accessor(&self) -> Option> { - if let Some(accessor) = &self.storage_options_accessor { - return Some(accessor.clone()); - } + /// Get the StorageOptionsAccessor from the params + pub fn get_accessor(&self) -> Option> { + self.storage_options_accessor.clone() + } - // Create accessor from legacy fields - match (&self.storage_options, &self.storage_options_provider) { - (Some(opts), Some(provider)) => Some(Arc::new( - StorageOptionsAccessor::with_initial_and_provider(opts.clone(), provider.clone()), - )), - (None, Some(provider)) => Some(Arc::new(StorageOptionsAccessor::with_provider( - provider.clone(), - ))), - (Some(opts), None) => Some(Arc::new(StorageOptionsAccessor::static_options( - opts.clone(), - ))), - (None, None) => None, - } + /// Get storage options from the accessor, if any + /// + /// Returns the initial storage options from the accessor without triggering refresh. + pub fn storage_options(&self) -> Option<&HashMap> { + self.storage_options_accessor + .as_ref() + .and_then(|a| a.initial_storage_options()) } } @@ -266,7 +245,7 @@ impl ObjectStoreParams { impl std::hash::Hash for ObjectStoreParams { #[allow(deprecated)] fn hash(&self, state: &mut H) { - // For hashing, we use pointer values for ObjectStore, S3 credentials, wrapper, and storage options provider + // For hashing, we use pointer values for ObjectStore, S3 credentials, wrapper self.block_size.hash(state); if let Some((store, url)) = &self.object_store { Arc::as_ptr(store).hash(state); @@ -280,15 +259,6 @@ impl std::hash::Hash for ObjectStoreParams { if let Some(wrapper) = &self.object_store_wrapper { Arc::as_ptr(wrapper).hash(state); } - if let Some(storage_options) = &self.storage_options { - for (key, value) in storage_options { - key.hash(state); - value.hash(state); - } - } - if let Some(provider) = &self.storage_options_provider { - provider.provider_id().hash(state); - } if let Some(accessor) = &self.storage_options_accessor { accessor.accessor_id().hash(state); } @@ -308,7 +278,7 @@ impl PartialEq for ObjectStoreParams { } // For equality, we use pointer comparison for ObjectStore, S3 credentials, wrapper - // For storage_options_provider and accessor, we use provider_id()/accessor_id() for semantic equality + // For accessor, we use accessor_id() for semantic equality self.block_size == other.block_size && self .object_store @@ -321,15 +291,6 @@ impl PartialEq for ObjectStoreParams { && self.s3_credentials_refresh_offset == other.s3_credentials_refresh_offset && self.object_store_wrapper.as_ref().map(Arc::as_ptr) == other.object_store_wrapper.as_ref().map(Arc::as_ptr) - && self.storage_options == other.storage_options - && self - .storage_options_provider - .as_ref() - .map(|p| p.provider_id()) - == other - .storage_options_provider - .as_ref() - .map(|p| p.provider_id()) && self .storage_options_accessor .as_ref() @@ -467,7 +428,7 @@ impl ObjectStore { if let Some((store, path)) = params.object_store.as_ref() { let mut inner = store.clone(); let store_prefix = - registry.calculate_object_store_prefix(uri, params.storage_options.as_ref())?; + registry.calculate_object_store_prefix(uri, params.storage_options())?; if let Some(wrapper) = params.object_store_wrapper.as_ref() { inner = wrapper.wrap(&store_prefix, inner); } @@ -1037,8 +998,11 @@ mod tests { ) { // Test the default let registry = Arc::new(ObjectStoreRegistry::default()); + let accessor = storage_options + .clone() + .map(|opts| Arc::new(StorageOptionsAccessor::static_options(opts))); let params = ObjectStoreParams { - storage_options: storage_options.clone(), + storage_options_accessor: accessor.clone(), ..ObjectStoreParams::default() }; let (store, _) = ObjectStore::from_uri_and_params(registry, uri, ¶ms) @@ -1050,7 +1014,7 @@ mod tests { let registry = Arc::new(ObjectStoreRegistry::default()); let params = ObjectStoreParams { block_size: Some(1024), - storage_options: storage_options.clone(), + storage_options_accessor: accessor, ..ObjectStoreParams::default() }; let (store, _) = ObjectStore::from_uri_and_params(registry, uri, ¶ms) diff --git a/rust/lance-io/src/object_store/providers.rs b/rust/lance-io/src/object_store/providers.rs index f2aef00583b..1848cf30b27 100644 --- a/rust/lance-io/src/object_store/providers.rs +++ b/rust/lance-io/src/object_store/providers.rs @@ -209,7 +209,7 @@ impl ObjectStoreRegistry { }; let cache_path = - provider.calculate_object_store_prefix(&base_path, params.storage_options.as_ref())?; + provider.calculate_object_store_prefix(&base_path, params.storage_options())?; let cache_key = (cache_path.clone(), params.clone()); // Check if we have a cached store for this base path and params @@ -417,12 +417,15 @@ mod tests { #[tokio::test] async fn test_stats_hit_miss_tracking() { + use crate::object_store::StorageOptionsAccessor; let registry = ObjectStoreRegistry::default(); let url = Url::parse("memory://test").unwrap(); let params1 = ObjectStoreParams::default(); let params2 = ObjectStoreParams { - storage_options: Some(HashMap::from([("k".into(), "v".into())])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::static_options( + HashMap::from([("k".into(), "v".into())]), + ))), ..Default::default() }; diff --git a/rust/lance-io/src/object_store/providers/aws.rs b/rust/lance-io/src/object_store/providers/aws.rs index 4e7ce43d9f2..53e5a93bde1 100644 --- a/rust/lance-io/src/object_store/providers/aws.rs +++ b/rust/lance-io/src/object_store/providers/aws.rs @@ -56,8 +56,8 @@ impl AwsStoreProvider { let mut s3_storage_options = storage_options.as_s3_options(); let region = resolve_s3_region(base_path, &s3_storage_options).await?; - // Get or create accessor from params - let accessor = params.get_or_create_accessor(); + // Get accessor from params + let accessor = params.get_accessor(); let (aws_creds, region) = build_aws_credential( params.s3_credentials_refresh_offset, @@ -136,7 +136,7 @@ impl ObjectStoreProvider for AwsStoreProvider { ) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); let mut storage_options = - StorageOptions(params.storage_options.clone().unwrap_or_default()); + StorageOptions(params.storage_options().cloned().unwrap_or_default()); storage_options.with_env_s3(); let download_retry_count = storage_options.download_retry_count(); @@ -176,7 +176,7 @@ impl ObjectStoreProvider for AwsStoreProvider { download_retry_count, io_tracker: Default::default(), store_prefix: self - .calculate_object_store_prefix(&base_path, params.storage_options.as_ref())?, + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } } @@ -443,10 +443,14 @@ impl ObjectStoreParams { aws_credentials: Option, region: Option, ) -> Self { + let storage_options_accessor = region.map(|region| { + let opts: HashMap = + [("region".into(), region)].iter().cloned().collect(); + Arc::new(StorageOptionsAccessor::static_options(opts)) + }); Self { aws_credentials, - storage_options: region - .map(|region| [("region".into(), region)].iter().cloned().collect()), + storage_options_accessor, ..Default::default() } } @@ -667,13 +671,16 @@ mod tests { #[tokio::test] async fn test_use_opendal_flag() { + use crate::object_store::StorageOptionsAccessor; let provider = AwsStoreProvider; let url = Url::parse("s3://test-bucket/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options: Some(HashMap::from([ - ("use_opendal".to_string(), "true".to_string()), - ("region".to_string(), "us-west-2".to_string()), - ])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::static_options( + HashMap::from([ + ("use_opendal".to_string(), "true".to_string()), + ("region".to_string(), "us-west-2".to_string()), + ]), + ))), ..Default::default() }; diff --git a/rust/lance-io/src/object_store/providers/azure.rs b/rust/lance-io/src/object_store/providers/azure.rs index 084e61a72e2..1c64f412d69 100644 --- a/rust/lance-io/src/object_store/providers/azure.rs +++ b/rust/lance-io/src/object_store/providers/azure.rs @@ -95,7 +95,7 @@ impl ObjectStoreProvider for AzureBlobStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); let mut storage_options = - StorageOptions(params.storage_options.clone().unwrap_or_default()); + StorageOptions(params.storage_options().cloned().unwrap_or_default()); storage_options.with_env_azure(); let download_retry_count = storage_options.download_retry_count(); @@ -124,7 +124,7 @@ impl ObjectStoreProvider for AzureBlobStoreProvider { download_retry_count, io_tracker: Default::default(), store_prefix: self - .calculate_object_store_prefix(&base_path, params.storage_options.as_ref())?, + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } @@ -232,21 +232,24 @@ mod tests { #[tokio::test] async fn test_use_opendal_flag() { + use crate::object_store::StorageOptionsAccessor; let provider = AzureBlobStoreProvider; let url = Url::parse("az://test-container/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options: Some(HashMap::from([ - ("use_opendal".to_string(), "true".to_string()), - ("account_name".to_string(), "test_account".to_string()), - ( - "endpoint".to_string(), - "https://test_account.blob.core.windows.net".to_string(), - ), - ( - "account_key".to_string(), - "dGVzdF9hY2NvdW50X2tleQ==".to_string(), - ), - ])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::static_options( + HashMap::from([ + ("use_opendal".to_string(), "true".to_string()), + ("account_name".to_string(), "test_account".to_string()), + ( + "endpoint".to_string(), + "https://test_account.blob.core.windows.net".to_string(), + ), + ( + "account_key".to_string(), + "dGVzdF9hY2NvdW50X2tleQ==".to_string(), + ), + ]), + ))), ..Default::default() }; diff --git a/rust/lance-io/src/object_store/providers/gcp.rs b/rust/lance-io/src/object_store/providers/gcp.rs index 282aac13b94..c5fa7e6fca7 100644 --- a/rust/lance-io/src/object_store/providers/gcp.rs +++ b/rust/lance-io/src/object_store/providers/gcp.rs @@ -96,7 +96,7 @@ impl ObjectStoreProvider for GcsStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); let mut storage_options = - StorageOptions(params.storage_options.clone().unwrap_or_default()); + StorageOptions(params.storage_options().cloned().unwrap_or_default()); storage_options.with_env_gcs(); let download_retry_count = storage_options.download_retry_count(); @@ -125,7 +125,7 @@ impl ObjectStoreProvider for GcsStoreProvider { download_retry_count, io_tracker: Default::default(), store_prefix: self - .calculate_object_store_prefix(&base_path, params.storage_options.as_ref())?, + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } } @@ -182,16 +182,19 @@ mod tests { #[tokio::test] async fn test_use_opendal_flag() { + use crate::object_store::StorageOptionsAccessor; let provider = GcsStoreProvider; let url = Url::parse("gs://test-bucket/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options: Some(HashMap::from([ - ("use_opendal".to_string(), "true".to_string()), - ( - "service_account".to_string(), - "test@example.iam.gserviceaccount.com".to_string(), - ), - ])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::static_options( + HashMap::from([ + ("use_opendal".to_string(), "true".to_string()), + ( + "service_account".to_string(), + "test@example.iam.gserviceaccount.com".to_string(), + ), + ]), + ))), ..Default::default() }; diff --git a/rust/lance-io/src/object_store/providers/huggingface.rs b/rust/lance-io/src/object_store/providers/huggingface.rs index 5bbdf9fcd1c..47a752768f0 100644 --- a/rust/lance-io/src/object_store/providers/huggingface.rs +++ b/rust/lance-io/src/object_store/providers/huggingface.rs @@ -65,7 +65,7 @@ impl ObjectStoreProvider for HuggingfaceStoreProvider { } = parse_hf_url(&base_path)?; let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); let download_retry_count = storage_options.download_retry_count(); // Build OpenDAL config with allowed keys only. @@ -115,7 +115,7 @@ impl ObjectStoreProvider for HuggingfaceStoreProvider { download_retry_count, io_tracker: Default::default(), store_prefix: self - .calculate_object_store_prefix(&base_path, params.storage_options.as_ref())?, + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } @@ -159,12 +159,13 @@ mod tests { #[test] fn storage_option_revision_takes_precedence() { + use crate::object_store::StorageOptionsAccessor; + use std::sync::Arc; let url = Url::parse("hf://datasets/acme/repo/data/file").unwrap(); let params = ObjectStoreParams { - storage_options: Some(HashMap::from([( - String::from("hf_revision"), - String::from("stable"), - )])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::static_options( + HashMap::from([(String::from("hf_revision"), String::from("stable"))]), + ))), ..Default::default() }; // new_store should accept without creating operator; test precedence via builder config @@ -177,8 +178,7 @@ mod tests { config_map.insert("repo_type".to_string(), repo_type); config_map.insert("repo".to_string(), repo_id); if let Some(rev) = params - .storage_options - .as_ref() + .storage_options() .unwrap() .get("hf_revision") .cloned() diff --git a/rust/lance-io/src/object_store/providers/local.rs b/rust/lance-io/src/object_store/providers/local.rs index b7820668013..78c8c9632c4 100644 --- a/rust/lance-io/src/object_store/providers/local.rs +++ b/rust/lance-io/src/object_store/providers/local.rs @@ -20,7 +20,7 @@ pub struct FileStoreProvider; impl ObjectStoreProvider for FileStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_LOCAL_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); let download_retry_count = storage_options.download_retry_count(); Ok(ObjectStore { inner: Arc::new(LocalFileSystem::new()), @@ -33,7 +33,7 @@ impl ObjectStoreProvider for FileStoreProvider { download_retry_count, io_tracker: Default::default(), store_prefix: self - .calculate_object_store_prefix(&base_path, params.storage_options.as_ref())?, + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } diff --git a/rust/lance-io/src/object_store/providers/memory.rs b/rust/lance-io/src/object_store/providers/memory.rs index 91e67411d44..addc2fafc80 100644 --- a/rust/lance-io/src/object_store/providers/memory.rs +++ b/rust/lance-io/src/object_store/providers/memory.rs @@ -19,7 +19,7 @@ pub struct MemoryStoreProvider; impl ObjectStoreProvider for MemoryStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_LOCAL_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); let download_retry_count = storage_options.download_retry_count(); Ok(ObjectStore { inner: Arc::new(InMemory::new()), @@ -32,7 +32,7 @@ impl ObjectStoreProvider for MemoryStoreProvider { download_retry_count, io_tracker: Default::default(), store_prefix: self - .calculate_object_store_prefix(&base_path, params.storage_options.as_ref())?, + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } diff --git a/rust/lance-io/src/object_store/providers/oss.rs b/rust/lance-io/src/object_store/providers/oss.rs index 115227a0d90..aace58921d2 100644 --- a/rust/lance-io/src/object_store/providers/oss.rs +++ b/rust/lance-io/src/object_store/providers/oss.rs @@ -22,7 +22,7 @@ pub struct OssStoreProvider; impl ObjectStoreProvider for OssStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); let bucket = base_path .host_str() @@ -107,8 +107,7 @@ impl ObjectStoreProvider for OssStoreProvider { io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, download_retry_count: storage_options.download_retry_count(), io_tracker: Default::default(), - store_prefix: self - .calculate_object_store_prefix(&url, params.storage_options.as_ref())?, + store_prefix: self.calculate_object_store_prefix(&url, params.storage_options())?, }) } } diff --git a/rust/lance-io/src/object_store/storage_options.rs b/rust/lance-io/src/object_store/storage_options.rs index 2e630279a73..8204210990b 100644 --- a/rust/lance-io/src/object_store/storage_options.rs +++ b/rust/lance-io/src/object_store/storage_options.rs @@ -1,21 +1,32 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -//! Storage options provider for dynamic credential fetching +//! Storage options provider and accessor for dynamic credential fetching //! -//! This module provides a trait for fetching storage options from various sources -//! (namespace servers, secret managers, etc.) with support for expiration tracking -//! and automatic refresh. +//! This module provides: +//! - [`StorageOptionsProvider`] trait for fetching storage options from various sources +//! (namespace servers, secret managers, etc.) with support for expiration tracking +//! - [`StorageOptionsAccessor`] for unified access to storage options with automatic +//! caching and refresh use std::collections::HashMap; use std::fmt; use std::sync::Arc; +use std::time::Duration; + +#[cfg(test)] +use mock_instant::thread_local::{SystemTime, UNIX_EPOCH}; + +#[cfg(not(test))] +use std::time::{SystemTime, UNIX_EPOCH}; -use crate::{Error, Result}; use async_trait::async_trait; use lance_namespace::models::DescribeTableRequest; use lance_namespace::LanceNamespace; use snafu::location; +use tokio::sync::RwLock; + +use crate::{Error, Result}; /// Key for the expiration timestamp in storage options HashMap pub const EXPIRES_AT_MILLIS_KEY: &str = "expires_at_millis"; @@ -23,6 +34,9 @@ pub const EXPIRES_AT_MILLIS_KEY: &str = "expires_at_millis"; /// Key for the refresh offset in storage options HashMap (milliseconds before expiry to refresh) pub const REFRESH_OFFSET_MILLIS_KEY: &str = "refresh_offset_millis"; +/// Default refresh offset: 60 seconds before expiration +const DEFAULT_REFRESH_OFFSET_MILLIS: u64 = 60_000; + /// Trait for providing storage options with expiration tracking /// /// Implementations can fetch storage options from various sources (namespace servers, @@ -142,3 +156,558 @@ impl StorageOptionsProvider for LanceNamespaceStorageOptionsProvider { ) } } + +/// Unified access to storage options with automatic caching and refresh +/// +/// This struct bundles static storage options with an optional dynamic provider, +/// handling all caching and refresh logic internally. It provides a single entry point +/// for accessing storage options regardless of whether they're static or dynamic. +/// +/// # Behavior +/// +/// - If only static options are provided, returns those options +/// - If a provider is configured, fetches from provider and caches results +/// - Automatically refreshes cached options before expiration (based on refresh_offset) +/// - Uses `expires_at_millis` key to track expiration +/// +/// # Thread Safety +/// +/// The accessor is thread-safe and can be shared across multiple tasks. +/// Concurrent refresh attempts are deduplicated using a try-lock mechanism. +pub struct StorageOptionsAccessor { + /// Initial/fallback static storage options + initial_options: Option>, + + /// Optional dynamic provider for refreshing options + provider: Option>, + + /// Cached storage options with expiration tracking + cache: Arc>>, + + /// Duration before expiry to trigger refresh + refresh_offset: Duration, +} + +impl fmt::Debug for StorageOptionsAccessor { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("StorageOptionsAccessor") + .field("has_initial_options", &self.initial_options.is_some()) + .field("has_provider", &self.provider.is_some()) + .field("refresh_offset", &self.refresh_offset) + .finish() + } +} + +#[derive(Debug, Clone)] +struct CachedStorageOptions { + options: HashMap, + expires_at_millis: Option, +} + +impl StorageOptionsAccessor { + /// Extract refresh offset from storage options, or use default + fn extract_refresh_offset(options: &HashMap) -> Duration { + options + .get(REFRESH_OFFSET_MILLIS_KEY) + .and_then(|s| s.parse::().ok()) + .map(Duration::from_millis) + .unwrap_or(Duration::from_millis(DEFAULT_REFRESH_OFFSET_MILLIS)) + } + + /// Create an accessor with only static options (no refresh capability) + /// + /// The returned accessor will always return the provided options. + /// This is useful when credentials don't expire or are managed externally. + pub fn static_options(options: HashMap) -> Self { + let expires_at_millis = options + .get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::().ok()); + let refresh_offset = Self::extract_refresh_offset(&options); + + Self { + initial_options: Some(options.clone()), + provider: None, + cache: Arc::new(RwLock::new(Some(CachedStorageOptions { + options, + expires_at_millis, + }))), + refresh_offset, + } + } + + /// Create an accessor with a dynamic provider (no initial options) + /// + /// The accessor will fetch from the provider on first access and cache + /// the results. Refresh happens automatically before expiration. + /// Uses the default refresh offset (60 seconds) until options are fetched. + /// + /// # Arguments + /// * `provider` - The storage options provider for fetching fresh options + pub fn with_provider(provider: Arc) -> Self { + Self { + initial_options: None, + provider: Some(provider), + cache: Arc::new(RwLock::new(None)), + refresh_offset: Duration::from_millis(DEFAULT_REFRESH_OFFSET_MILLIS), + } + } + + /// Create an accessor with initial options and a dynamic provider + /// + /// Initial options are used until they expire, then the provider is called. + /// This avoids an immediate fetch when initial credentials are still valid. + /// The `refresh_offset_millis` key in initial_options controls refresh timing. + /// + /// # Arguments + /// * `initial_options` - Initial storage options to cache + /// * `provider` - The storage options provider for refreshing + pub fn with_initial_and_provider( + initial_options: HashMap, + provider: Arc, + ) -> Self { + let expires_at_millis = initial_options + .get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::().ok()); + let refresh_offset = Self::extract_refresh_offset(&initial_options); + + Self { + initial_options: Some(initial_options.clone()), + provider: Some(provider), + cache: Arc::new(RwLock::new(Some(CachedStorageOptions { + options: initial_options, + expires_at_millis, + }))), + refresh_offset, + } + } + + /// Get current valid storage options + /// + /// - Returns cached options if not expired + /// - Fetches from provider if expired or not cached + /// - Falls back to initial_options if provider returns None + /// + /// # Errors + /// + /// Returns an error if: + /// - The provider fails to fetch options + /// - No options are available (no cache, no provider, no initial options) + pub async fn get_storage_options(&self) -> Result { + loop { + match self.do_get_storage_options().await? { + Some(options) => return Ok(options), + None => { + // Lock was busy, wait 10ms before retrying + tokio::time::sleep(Duration::from_millis(10)).await; + continue; + } + } + } + } + + async fn do_get_storage_options(&self) -> Result> { + // Check if we have valid cached options with read lock + { + let cached = self.cache.read().await; + if !self.needs_refresh(&cached) { + if let Some(cached_opts) = &*cached { + return Ok(Some(super::StorageOptions(cached_opts.options.clone()))); + } + } + } + + // If no provider, return initial options or error + let Some(provider) = &self.provider else { + return if let Some(initial) = &self.initial_options { + Ok(Some(super::StorageOptions(initial.clone()))) + } else { + Err(Error::IO { + source: Box::new(std::io::Error::other("No storage options available")), + location: location!(), + }) + }; + }; + + // Try to acquire write lock - if it fails, return None and let caller retry + let Ok(mut cache) = self.cache.try_write() else { + return Ok(None); + }; + + // Double-check if options are still stale after acquiring write lock + // (another thread might have refreshed them) + if !self.needs_refresh(&cache) { + if let Some(cached_opts) = &*cache { + return Ok(Some(super::StorageOptions(cached_opts.options.clone()))); + } + } + + log::debug!( + "Refreshing storage options from provider: {}", + provider.provider_id() + ); + + let storage_options_map = + provider + .fetch_storage_options() + .await + .map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to fetch storage options: {}", + e + ))), + location: location!(), + })?; + + let Some(options) = storage_options_map else { + // Provider returned None, fall back to initial options + if let Some(initial) = &self.initial_options { + return Ok(Some(super::StorageOptions(initial.clone()))); + } + return Err(Error::IO { + source: Box::new(std::io::Error::other( + "Provider returned no storage options", + )), + location: location!(), + }); + }; + + let expires_at_millis = options + .get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::().ok()); + + if let Some(expires_at) = expires_at_millis { + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or(Duration::from_secs(0)) + .as_millis() as u64; + let expires_in_secs = (expires_at.saturating_sub(now_ms)) / 1000; + log::debug!( + "Successfully refreshed storage options from provider: {}, options expire in {} seconds", + provider.provider_id(), + expires_in_secs + ); + } else { + log::debug!( + "Successfully refreshed storage options from provider: {} (no expiration)", + provider.provider_id() + ); + } + + *cache = Some(CachedStorageOptions { + options: options.clone(), + expires_at_millis, + }); + + Ok(Some(super::StorageOptions(options))) + } + + fn needs_refresh(&self, cached: &Option) -> bool { + match cached { + None => true, + Some(cached_opts) => { + if let Some(expires_at_millis) = cached_opts.expires_at_millis { + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or(Duration::from_secs(0)) + .as_millis() as u64; + + // Refresh if we're within the refresh offset of expiration + let refresh_offset_millis = self.refresh_offset.as_millis() as u64; + now_ms + refresh_offset_millis >= expires_at_millis + } else { + // No expiration means options never expire + false + } + } + } + } + + /// Get the initial storage options without refresh + /// + /// Returns the initial options that were provided when creating the accessor. + /// This does not trigger any refresh, even if the options have expired. + pub fn initial_storage_options(&self) -> Option<&HashMap> { + self.initial_options.as_ref() + } + + /// Get the accessor ID for equality/hashing + /// + /// Returns the provider_id if a provider exists, otherwise generates + /// a stable ID from the initial options hash. + pub fn accessor_id(&self) -> String { + if let Some(provider) = &self.provider { + provider.provider_id() + } else if let Some(initial) = &self.initial_options { + // Generate a stable ID from initial options + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let mut hasher = DefaultHasher::new(); + let mut keys: Vec<_> = initial.keys().collect(); + keys.sort(); + for key in keys { + key.hash(&mut hasher); + initial.get(key).hash(&mut hasher); + } + format!("static_options_{:x}", hasher.finish()) + } else { + "empty_accessor".to_string() + } + } + + /// Check if this accessor has a dynamic provider + pub fn has_provider(&self) -> bool { + self.provider.is_some() + } + + /// Get the refresh offset duration + pub fn refresh_offset(&self) -> Duration { + self.refresh_offset + } + + /// Get the storage options provider, if any + pub fn provider(&self) -> Option<&Arc> { + self.provider.as_ref() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use mock_instant::thread_local::MockClock; + + #[derive(Debug)] + struct MockStorageOptionsProvider { + call_count: Arc>, + expires_in_millis: Option, + } + + impl MockStorageOptionsProvider { + fn new(expires_in_millis: Option) -> Self { + Self { + call_count: Arc::new(RwLock::new(0)), + expires_in_millis, + } + } + + async fn get_call_count(&self) -> usize { + *self.call_count.read().await + } + } + + #[async_trait] + impl StorageOptionsProvider for MockStorageOptionsProvider { + async fn fetch_storage_options(&self) -> Result>> { + let count = { + let mut c = self.call_count.write().await; + *c += 1; + *c + }; + + let mut options = HashMap::from([ + ("aws_access_key_id".to_string(), format!("AKID_{}", count)), + ( + "aws_secret_access_key".to_string(), + format!("SECRET_{}", count), + ), + ("aws_session_token".to_string(), format!("TOKEN_{}", count)), + ]); + + if let Some(expires_in) = self.expires_in_millis { + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() as u64; + let expires_at = now_ms + expires_in; + options.insert(EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()); + } + + Ok(Some(options)) + } + + fn provider_id(&self) -> String { + let ptr = Arc::as_ptr(&self.call_count) as usize; + format!("MockStorageOptionsProvider {{ id: {} }}", ptr) + } + } + + #[tokio::test] + async fn test_static_options_only() { + let options = HashMap::from([ + ("key1".to_string(), "value1".to_string()), + ("key2".to_string(), "value2".to_string()), + ]); + let accessor = StorageOptionsAccessor::static_options(options.clone()); + + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0, options); + assert!(!accessor.has_provider()); + assert_eq!(accessor.initial_storage_options(), Some(&options)); + } + + #[tokio::test] + async fn test_provider_only() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + let accessor = StorageOptionsAccessor::with_provider(mock_provider.clone()); + + let result = accessor.get_storage_options().await.unwrap(); + assert!(result.0.contains_key("aws_access_key_id")); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + assert!(accessor.has_provider()); + assert_eq!(accessor.initial_storage_options(), None); + assert_eq!(mock_provider.get_call_count().await, 1); + } + + #[tokio::test] + async fn test_initial_and_provider_uses_initial_first() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let now_ms = MockClock::system_time().as_millis() as u64; + let expires_at = now_ms + 600_000; // 10 minutes from now + + let initial = HashMap::from([ + ("aws_access_key_id".to_string(), "INITIAL_KEY".to_string()), + ( + "aws_secret_access_key".to_string(), + "INITIAL_SECRET".to_string(), + ), + (EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()), + ]); + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + let accessor = StorageOptionsAccessor::with_initial_and_provider( + initial.clone(), + mock_provider.clone(), + ); + + // First call uses initial + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "INITIAL_KEY"); + assert_eq!(mock_provider.get_call_count().await, 0); // Provider not called yet + } + + #[tokio::test] + async fn test_caching_and_refresh() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); // 10 min expiry + // Use with_initial_and_provider to set custom refresh_offset_millis (5 min = 300000ms) + let now_ms = MockClock::system_time().as_millis() as u64; + let expires_at = now_ms + 600_000; // 10 minutes from now + let initial = HashMap::from([ + (EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()), + (REFRESH_OFFSET_MILLIS_KEY.to_string(), "300000".to_string()), // 5 min refresh offset + ]); + let accessor = + StorageOptionsAccessor::with_initial_and_provider(initial, mock_provider.clone()); + + // First call uses initial cached options + let result = accessor.get_storage_options().await.unwrap(); + assert!(result.0.contains_key(EXPIRES_AT_MILLIS_KEY)); + assert_eq!(mock_provider.get_call_count().await, 0); + + // Advance time to 6 minutes - should trigger refresh (within 5 min refresh offset) + MockClock::set_system_time(Duration::from_secs(100_000 + 360)); + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + assert_eq!(mock_provider.get_call_count().await, 1); + } + + #[tokio::test] + async fn test_expired_initial_triggers_refresh() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let now_ms = MockClock::system_time().as_millis() as u64; + let expired_time = now_ms - 1_000; // Expired 1 second ago + + let initial = HashMap::from([ + ("aws_access_key_id".to_string(), "EXPIRED_KEY".to_string()), + (EXPIRES_AT_MILLIS_KEY.to_string(), expired_time.to_string()), + ]); + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + let accessor = + StorageOptionsAccessor::with_initial_and_provider(initial, mock_provider.clone()); + + // Should fetch from provider since initial is expired + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + assert_eq!(mock_provider.get_call_count().await, 1); + } + + #[tokio::test] + async fn test_accessor_id_with_provider() { + let mock_provider = Arc::new(MockStorageOptionsProvider::new(None)); + let accessor = StorageOptionsAccessor::with_provider(mock_provider); + + let id = accessor.accessor_id(); + assert!(id.starts_with("MockStorageOptionsProvider")); + } + + #[tokio::test] + async fn test_accessor_id_static() { + let options = HashMap::from([("key".to_string(), "value".to_string())]); + let accessor = StorageOptionsAccessor::static_options(options); + + let id = accessor.accessor_id(); + assert!(id.starts_with("static_options_")); + } + + #[tokio::test] + async fn test_concurrent_access() { + // Create a mock provider with far future expiration + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(9999999999999))); + + let accessor = Arc::new(StorageOptionsAccessor::with_provider(mock_provider.clone())); + + // Spawn 10 concurrent tasks that all try to get options at the same time + let mut handles = vec![]; + for i in 0..10 { + let acc = accessor.clone(); + let handle = tokio::spawn(async move { + let result = acc.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + i + }); + handles.push(handle); + } + + // Wait for all tasks to complete + let results: Vec<_> = futures::future::join_all(handles) + .await + .into_iter() + .map(|r| r.unwrap()) + .collect(); + + // Verify all 10 tasks completed successfully + assert_eq!(results.len(), 10); + + // The provider should have been called exactly once + let call_count = mock_provider.get_call_count().await; + assert_eq!( + call_count, 1, + "Provider should be called exactly once despite concurrent access" + ); + } + + #[tokio::test] + async fn test_no_expiration_never_refreshes() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let mock_provider = Arc::new(MockStorageOptionsProvider::new(None)); // No expiration + let accessor = StorageOptionsAccessor::with_provider(mock_provider.clone()); + + // First call fetches + accessor.get_storage_options().await.unwrap(); + assert_eq!(mock_provider.get_call_count().await, 1); + + // Advance time significantly + MockClock::set_system_time(Duration::from_secs(200_000)); + + // Should still use cached options + accessor.get_storage_options().await.unwrap(); + assert_eq!(mock_provider.get_call_count().await, 1); + } +} diff --git a/rust/lance-io/src/object_store/storage_options_accessor.rs b/rust/lance-io/src/object_store/storage_options_accessor.rs deleted file mode 100644 index 7e282f9f589..00000000000 --- a/rust/lance-io/src/object_store/storage_options_accessor.rs +++ /dev/null @@ -1,584 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -//! Unified storage options accessor with caching and automatic refresh -//! -//! This module provides [`StorageOptionsAccessor`] which bundles static storage options -//! with an optional dynamic provider, handling all caching and refresh logic internally. - -use std::collections::HashMap; -use std::fmt; -use std::sync::Arc; -use std::time::Duration; - -#[cfg(test)] -use mock_instant::thread_local::{SystemTime, UNIX_EPOCH}; - -#[cfg(not(test))] -use std::time::{SystemTime, UNIX_EPOCH}; - -use tokio::sync::RwLock; - -use super::storage_options::StorageOptionsProvider; -use super::{StorageOptions, EXPIRES_AT_MILLIS_KEY, REFRESH_OFFSET_MILLIS_KEY}; -use lance_core::{Error, Result}; -use snafu::location; - -/// Default refresh offset: 60 seconds before expiration -const DEFAULT_REFRESH_OFFSET_MILLIS: u64 = 60_000; - -/// Unified access to storage options with automatic caching and refresh -/// -/// This struct bundles static storage options with an optional dynamic provider, -/// handling all caching and refresh logic internally. It provides a single entry point -/// for accessing storage options regardless of whether they're static or dynamic. -/// -/// # Behavior -/// -/// - If only static options are provided, returns those options -/// - If a provider is configured, fetches from provider and caches results -/// - Automatically refreshes cached options before expiration (based on refresh_offset) -/// - Uses `expires_at_millis` key to track expiration -/// -/// # Thread Safety -/// -/// The accessor is thread-safe and can be shared across multiple tasks. -/// Concurrent refresh attempts are deduplicated using a try-lock mechanism. -pub struct StorageOptionsAccessor { - /// Initial/fallback static storage options - initial_options: Option>, - - /// Optional dynamic provider for refreshing options - provider: Option>, - - /// Cached storage options with expiration tracking - cache: Arc>>, - - /// Duration before expiry to trigger refresh - refresh_offset: Duration, -} - -impl fmt::Debug for StorageOptionsAccessor { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("StorageOptionsAccessor") - .field("has_initial_options", &self.initial_options.is_some()) - .field("has_provider", &self.provider.is_some()) - .field("refresh_offset", &self.refresh_offset) - .finish() - } -} - -#[derive(Debug, Clone)] -struct CachedStorageOptions { - options: HashMap, - expires_at_millis: Option, -} - -impl StorageOptionsAccessor { - /// Extract refresh offset from storage options, or use default - fn extract_refresh_offset(options: &HashMap) -> Duration { - options - .get(REFRESH_OFFSET_MILLIS_KEY) - .and_then(|s| s.parse::().ok()) - .map(Duration::from_millis) - .unwrap_or(Duration::from_millis(DEFAULT_REFRESH_OFFSET_MILLIS)) - } - - /// Create an accessor with only static options (no refresh capability) - /// - /// The returned accessor will always return the provided options. - /// This is useful when credentials don't expire or are managed externally. - pub fn static_options(options: HashMap) -> Self { - let expires_at_millis = options - .get(EXPIRES_AT_MILLIS_KEY) - .and_then(|s| s.parse::().ok()); - let refresh_offset = Self::extract_refresh_offset(&options); - - Self { - initial_options: Some(options.clone()), - provider: None, - cache: Arc::new(RwLock::new(Some(CachedStorageOptions { - options, - expires_at_millis, - }))), - refresh_offset, - } - } - - /// Create an accessor with a dynamic provider (no initial options) - /// - /// The accessor will fetch from the provider on first access and cache - /// the results. Refresh happens automatically before expiration. - /// Uses the default refresh offset (60 seconds) until options are fetched. - /// - /// # Arguments - /// * `provider` - The storage options provider for fetching fresh options - pub fn with_provider(provider: Arc) -> Self { - Self { - initial_options: None, - provider: Some(provider), - cache: Arc::new(RwLock::new(None)), - refresh_offset: Duration::from_millis(DEFAULT_REFRESH_OFFSET_MILLIS), - } - } - - /// Create an accessor with initial options and a dynamic provider - /// - /// Initial options are used until they expire, then the provider is called. - /// This avoids an immediate fetch when initial credentials are still valid. - /// The `refresh_offset_millis` key in initial_options controls refresh timing. - /// - /// # Arguments - /// * `initial_options` - Initial storage options to cache - /// * `provider` - The storage options provider for refreshing - pub fn with_initial_and_provider( - initial_options: HashMap, - provider: Arc, - ) -> Self { - let expires_at_millis = initial_options - .get(EXPIRES_AT_MILLIS_KEY) - .and_then(|s| s.parse::().ok()); - let refresh_offset = Self::extract_refresh_offset(&initial_options); - - Self { - initial_options: Some(initial_options.clone()), - provider: Some(provider), - cache: Arc::new(RwLock::new(Some(CachedStorageOptions { - options: initial_options, - expires_at_millis, - }))), - refresh_offset, - } - } - - /// Get current valid storage options - /// - /// - Returns cached options if not expired - /// - Fetches from provider if expired or not cached - /// - Falls back to initial_options if provider returns None - /// - /// # Errors - /// - /// Returns an error if: - /// - The provider fails to fetch options - /// - No options are available (no cache, no provider, no initial options) - pub async fn get_storage_options(&self) -> Result { - loop { - match self.do_get_storage_options().await? { - Some(options) => return Ok(options), - None => { - // Lock was busy, wait 10ms before retrying - tokio::time::sleep(Duration::from_millis(10)).await; - continue; - } - } - } - } - - async fn do_get_storage_options(&self) -> Result> { - // Check if we have valid cached options with read lock - { - let cached = self.cache.read().await; - if !self.needs_refresh(&cached) { - if let Some(cached_opts) = &*cached { - return Ok(Some(StorageOptions(cached_opts.options.clone()))); - } - } - } - - // If no provider, return initial options or error - let Some(provider) = &self.provider else { - return if let Some(initial) = &self.initial_options { - Ok(Some(StorageOptions(initial.clone()))) - } else { - Err(Error::IO { - source: Box::new(std::io::Error::other("No storage options available")), - location: location!(), - }) - }; - }; - - // Try to acquire write lock - if it fails, return None and let caller retry - let Ok(mut cache) = self.cache.try_write() else { - return Ok(None); - }; - - // Double-check if options are still stale after acquiring write lock - // (another thread might have refreshed them) - if !self.needs_refresh(&cache) { - if let Some(cached_opts) = &*cache { - return Ok(Some(StorageOptions(cached_opts.options.clone()))); - } - } - - log::debug!( - "Refreshing storage options from provider: {}", - provider.provider_id() - ); - - let storage_options_map = - provider - .fetch_storage_options() - .await - .map_err(|e| Error::IO { - source: Box::new(std::io::Error::other(format!( - "Failed to fetch storage options: {}", - e - ))), - location: location!(), - })?; - - let Some(options) = storage_options_map else { - // Provider returned None, fall back to initial options - if let Some(initial) = &self.initial_options { - return Ok(Some(StorageOptions(initial.clone()))); - } - return Err(Error::IO { - source: Box::new(std::io::Error::other( - "Provider returned no storage options", - )), - location: location!(), - }); - }; - - let expires_at_millis = options - .get(EXPIRES_AT_MILLIS_KEY) - .and_then(|s| s.parse::().ok()); - - if let Some(expires_at) = expires_at_millis { - let now_ms = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap_or(Duration::from_secs(0)) - .as_millis() as u64; - let expires_in_secs = (expires_at.saturating_sub(now_ms)) / 1000; - log::debug!( - "Successfully refreshed storage options from provider: {}, options expire in {} seconds", - provider.provider_id(), - expires_in_secs - ); - } else { - log::debug!( - "Successfully refreshed storage options from provider: {} (no expiration)", - provider.provider_id() - ); - } - - *cache = Some(CachedStorageOptions { - options: options.clone(), - expires_at_millis, - }); - - Ok(Some(StorageOptions(options))) - } - - fn needs_refresh(&self, cached: &Option) -> bool { - match cached { - None => true, - Some(cached_opts) => { - if let Some(expires_at_millis) = cached_opts.expires_at_millis { - let now_ms = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap_or(Duration::from_secs(0)) - .as_millis() as u64; - - // Refresh if we're within the refresh offset of expiration - let refresh_offset_millis = self.refresh_offset.as_millis() as u64; - now_ms + refresh_offset_millis >= expires_at_millis - } else { - // No expiration means options never expire - false - } - } - } - } - - /// Get the initial storage options without refresh - /// - /// Returns the initial options that were provided when creating the accessor. - /// This does not trigger any refresh, even if the options have expired. - pub fn initial_storage_options(&self) -> Option<&HashMap> { - self.initial_options.as_ref() - } - - /// Get the accessor ID for equality/hashing - /// - /// Returns the provider_id if a provider exists, otherwise generates - /// a stable ID from the initial options hash. - pub fn accessor_id(&self) -> String { - if let Some(provider) = &self.provider { - provider.provider_id() - } else if let Some(initial) = &self.initial_options { - // Generate a stable ID from initial options - use std::collections::hash_map::DefaultHasher; - use std::hash::{Hash, Hasher}; - - let mut hasher = DefaultHasher::new(); - let mut keys: Vec<_> = initial.keys().collect(); - keys.sort(); - for key in keys { - key.hash(&mut hasher); - initial.get(key).hash(&mut hasher); - } - format!("static_options_{:x}", hasher.finish()) - } else { - "empty_accessor".to_string() - } - } - - /// Check if this accessor has a dynamic provider - pub fn has_provider(&self) -> bool { - self.provider.is_some() - } - - /// Get the refresh offset duration - pub fn refresh_offset(&self) -> Duration { - self.refresh_offset - } - - /// Get the storage options provider, if any - pub fn provider(&self) -> Option<&Arc> { - self.provider.as_ref() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use async_trait::async_trait; - use mock_instant::thread_local::MockClock; - - #[derive(Debug)] - struct MockStorageOptionsProvider { - call_count: Arc>, - expires_in_millis: Option, - } - - impl MockStorageOptionsProvider { - fn new(expires_in_millis: Option) -> Self { - Self { - call_count: Arc::new(RwLock::new(0)), - expires_in_millis, - } - } - - async fn get_call_count(&self) -> usize { - *self.call_count.read().await - } - } - - #[async_trait] - impl StorageOptionsProvider for MockStorageOptionsProvider { - async fn fetch_storage_options(&self) -> Result>> { - let count = { - let mut c = self.call_count.write().await; - *c += 1; - *c - }; - - let mut options = HashMap::from([ - ("aws_access_key_id".to_string(), format!("AKID_{}", count)), - ( - "aws_secret_access_key".to_string(), - format!("SECRET_{}", count), - ), - ("aws_session_token".to_string(), format!("TOKEN_{}", count)), - ]); - - if let Some(expires_in) = self.expires_in_millis { - let now_ms = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap() - .as_millis() as u64; - let expires_at = now_ms + expires_in; - options.insert(EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()); - } - - Ok(Some(options)) - } - - fn provider_id(&self) -> String { - let ptr = Arc::as_ptr(&self.call_count) as usize; - format!("MockStorageOptionsProvider {{ id: {} }}", ptr) - } - } - - #[tokio::test] - async fn test_static_options_only() { - let options = HashMap::from([ - ("key1".to_string(), "value1".to_string()), - ("key2".to_string(), "value2".to_string()), - ]); - let accessor = StorageOptionsAccessor::static_options(options.clone()); - - let result = accessor.get_storage_options().await.unwrap(); - assert_eq!(result.0, options); - assert!(!accessor.has_provider()); - assert_eq!(accessor.initial_storage_options(), Some(&options)); - } - - #[tokio::test] - async fn test_provider_only() { - MockClock::set_system_time(Duration::from_secs(100_000)); - - let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); - let accessor = StorageOptionsAccessor::with_provider(mock_provider.clone()); - - let result = accessor.get_storage_options().await.unwrap(); - assert!(result.0.contains_key("aws_access_key_id")); - assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); - assert!(accessor.has_provider()); - assert_eq!(accessor.initial_storage_options(), None); - assert_eq!(mock_provider.get_call_count().await, 1); - } - - #[tokio::test] - async fn test_initial_and_provider_uses_initial_first() { - MockClock::set_system_time(Duration::from_secs(100_000)); - - let now_ms = MockClock::system_time().as_millis() as u64; - let expires_at = now_ms + 600_000; // 10 minutes from now - - let initial = HashMap::from([ - ("aws_access_key_id".to_string(), "INITIAL_KEY".to_string()), - ( - "aws_secret_access_key".to_string(), - "INITIAL_SECRET".to_string(), - ), - (EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()), - ]); - let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); - - let accessor = StorageOptionsAccessor::with_initial_and_provider( - initial.clone(), - mock_provider.clone(), - ); - - // First call uses initial - let result = accessor.get_storage_options().await.unwrap(); - assert_eq!(result.0.get("aws_access_key_id").unwrap(), "INITIAL_KEY"); - assert_eq!(mock_provider.get_call_count().await, 0); // Provider not called yet - } - - #[tokio::test] - async fn test_caching_and_refresh() { - MockClock::set_system_time(Duration::from_secs(100_000)); - - let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); // 10 min expiry - // Use with_initial_and_provider to set custom refresh_offset_millis (5 min = 300000ms) - let now_ms = MockClock::system_time().as_millis() as u64; - let expires_at = now_ms + 600_000; // 10 minutes from now - let initial = HashMap::from([ - (EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()), - (REFRESH_OFFSET_MILLIS_KEY.to_string(), "300000".to_string()), // 5 min refresh offset - ]); - let accessor = - StorageOptionsAccessor::with_initial_and_provider(initial, mock_provider.clone()); - - // First call uses initial cached options - let result = accessor.get_storage_options().await.unwrap(); - assert!(result.0.contains_key(EXPIRES_AT_MILLIS_KEY)); - assert_eq!(mock_provider.get_call_count().await, 0); - - // Advance time to 6 minutes - should trigger refresh (within 5 min refresh offset) - MockClock::set_system_time(Duration::from_secs(100_000 + 360)); - let result = accessor.get_storage_options().await.unwrap(); - assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); - assert_eq!(mock_provider.get_call_count().await, 1); - } - - #[tokio::test] - async fn test_expired_initial_triggers_refresh() { - MockClock::set_system_time(Duration::from_secs(100_000)); - - let now_ms = MockClock::system_time().as_millis() as u64; - let expired_time = now_ms - 1_000; // Expired 1 second ago - - let initial = HashMap::from([ - ("aws_access_key_id".to_string(), "EXPIRED_KEY".to_string()), - (EXPIRES_AT_MILLIS_KEY.to_string(), expired_time.to_string()), - ]); - let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); - - let accessor = - StorageOptionsAccessor::with_initial_and_provider(initial, mock_provider.clone()); - - // Should fetch from provider since initial is expired - let result = accessor.get_storage_options().await.unwrap(); - assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); - assert_eq!(mock_provider.get_call_count().await, 1); - } - - #[tokio::test] - async fn test_accessor_id_with_provider() { - let mock_provider = Arc::new(MockStorageOptionsProvider::new(None)); - let accessor = StorageOptionsAccessor::with_provider(mock_provider); - - let id = accessor.accessor_id(); - assert!(id.starts_with("MockStorageOptionsProvider")); - } - - #[tokio::test] - async fn test_accessor_id_static() { - let options = HashMap::from([("key".to_string(), "value".to_string())]); - let accessor = StorageOptionsAccessor::static_options(options); - - let id = accessor.accessor_id(); - assert!(id.starts_with("static_options_")); - } - - #[tokio::test] - async fn test_concurrent_access() { - // Create a mock provider with far future expiration - let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(9999999999999))); - - let accessor = Arc::new(StorageOptionsAccessor::with_provider(mock_provider.clone())); - - // Spawn 10 concurrent tasks that all try to get options at the same time - let mut handles = vec![]; - for i in 0..10 { - let acc = accessor.clone(); - let handle = tokio::spawn(async move { - let result = acc.get_storage_options().await.unwrap(); - assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); - i - }); - handles.push(handle); - } - - // Wait for all tasks to complete - let results: Vec<_> = futures::future::join_all(handles) - .await - .into_iter() - .map(|r| r.unwrap()) - .collect(); - - // Verify all 10 tasks completed successfully - assert_eq!(results.len(), 10); - - // The provider should have been called exactly once - let call_count = mock_provider.get_call_count().await; - assert_eq!( - call_count, 1, - "Provider should be called exactly once despite concurrent access" - ); - } - - #[tokio::test] - async fn test_no_expiration_never_refreshes() { - MockClock::set_system_time(Duration::from_secs(100_000)); - - let mock_provider = Arc::new(MockStorageOptionsProvider::new(None)); // No expiration - let accessor = StorageOptionsAccessor::with_provider(mock_provider.clone()); - - // First call fetches - accessor.get_storage_options().await.unwrap(); - assert_eq!(mock_provider.get_call_count().await, 1); - - // Advance time significantly - MockClock::set_system_time(Duration::from_secs(200_000)); - - // Should still use cached options - accessor.get_storage_options().await.unwrap(); - assert_eq!(mock_provider.get_call_count().await, 1); - } -} diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index 0722347d684..005e09641cd 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -471,8 +471,11 @@ impl DirectoryNamespaceBuilder { session: &Option>, ) -> Result<(Arc, Path)> { // Build ObjectStoreParams from storage options + let accessor = storage_options.clone().map(|opts| { + Arc::new(lance_io::object_store::StorageOptionsAccessor::static_options(opts)) + }); let params = ObjectStoreParams { - storage_options: storage_options.clone(), + storage_options_accessor: accessor, ..Default::default() }; @@ -1262,7 +1265,9 @@ impl LanceNamespace for DirectoryNamespace { }; let store_params = self.storage_options.as_ref().map(|opts| ObjectStoreParams { - storage_options: Some(opts.clone()), + storage_options_accessor: Some(Arc::new( + lance_io::object_store::StorageOptionsAccessor::static_options(opts.clone()), + )), ..Default::default() }); diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index bfcb9602b9a..433a0ce4719 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -982,7 +982,11 @@ impl ManifestNamespace { let write_params = WriteParams { session, store_params: storage_options.as_ref().map(|opts| ObjectStoreParams { - storage_options: Some(opts.clone()), + storage_options_accessor: Some(Arc::new( + lance_io::object_store::StorageOptionsAccessor::static_options( + opts.clone(), + ), + )), ..Default::default() }), ..Default::default() diff --git a/rust/lance-table/src/io/commit.rs b/rust/lance-table/src/io/commit.rs index d93fa4b3c86..12e48121b36 100644 --- a/rust/lance-table/src/io/commit.rs +++ b/rust/lance-table/src/io/commit.rs @@ -766,14 +766,14 @@ pub async fn commit_handler_from_url( }; let options = options.clone().unwrap_or_default(); let storage_options_raw = - StorageOptions(options.storage_options.clone().unwrap_or_default()); + StorageOptions(options.storage_options().cloned().unwrap_or_default()); let dynamo_endpoint = get_dynamodb_endpoint(&storage_options_raw); let storage_options = storage_options_raw.as_s3_options(); let region = storage_options.get(&AmazonS3ConfigKey::Region).cloned(); - // Get or create accessor from the options - let accessor = options.get_or_create_accessor(); + // Get accessor from the options + let accessor = options.get_accessor(); let (aws_creds, region) = build_aws_credential( options.s3_credentials_refresh_offset, diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 177b3284b69..fe0a989dbd6 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -36,7 +36,7 @@ use lance_file::version::LanceFileVersion; use lance_index::DatasetIndexExt; use lance_io::object_store::{ LanceNamespaceStorageOptionsProvider, ObjectStore, ObjectStoreParams, StorageOptions, - StorageOptionsAccessor, + StorageOptionsAccessor, StorageOptionsProvider, }; use lance_io::utils::{read_last_block, read_message, read_metadata_offset, read_struct}; use lance_namespace::LanceNamespace; @@ -809,7 +809,6 @@ impl Dataset { /// * `namespace` - The namespace to use for table management /// * `table_id` - The table identifier /// * `params` - Write parameters - #[allow(deprecated)] pub async fn write_into_namespace( batches: impl RecordBatchReader + Send + 'static, namespace: Arc, @@ -865,22 +864,26 @@ impl Dataset { // Set initial credentials and provider from namespace if let Some(namespace_storage_options) = response.storage_options { - let provider = Arc::new(LanceNamespaceStorageOptionsProvider::new( - namespace, table_id, - )); + let provider: Arc = Arc::new( + LanceNamespaceStorageOptionsProvider::new(namespace, table_id), + ); // Merge namespace storage options with any existing options let mut merged_options = write_params .store_params .as_ref() - .and_then(|p| p.storage_options.clone()) + .and_then(|p| p.storage_options().cloned()) .unwrap_or_default(); merged_options.extend(namespace_storage_options); + let accessor = Arc::new(StorageOptionsAccessor::with_initial_and_provider( + merged_options, + provider, + )); + let existing_params = write_params.store_params.take().unwrap_or_default(); write_params.store_params = Some(ObjectStoreParams { - storage_options: Some(merged_options), - storage_options_provider: Some(provider), + storage_options_accessor: Some(accessor), ..existing_params }); } @@ -910,23 +913,28 @@ impl Dataset { // Set initial credentials and provider from namespace if let Some(namespace_storage_options) = response.storage_options { - let provider = Arc::new(LanceNamespaceStorageOptionsProvider::new( - namespace.clone(), - table_id.clone(), - )); + let provider: Arc = + Arc::new(LanceNamespaceStorageOptionsProvider::new( + namespace.clone(), + table_id.clone(), + )); // Merge namespace storage options with any existing options let mut merged_options = write_params .store_params .as_ref() - .and_then(|p| p.storage_options.clone()) + .and_then(|p| p.storage_options().cloned()) .unwrap_or_default(); merged_options.extend(namespace_storage_options); + let accessor = Arc::new(StorageOptionsAccessor::with_initial_and_provider( + merged_options, + provider, + )); + let existing_params = write_params.store_params.take().unwrap_or_default(); write_params.store_params = Some(ObjectStoreParams { - storage_options: Some(merged_options), - storage_options_provider: Some(provider), + storage_options_accessor: Some(accessor), ..existing_params }); } @@ -936,11 +944,8 @@ impl Dataset { // assumes no dataset exists and converts the mode to CREATE. let mut builder = DatasetBuilder::from_uri(uri.as_str()); if let Some(ref store_params) = write_params.store_params { - if let Some(ref storage_options) = store_params.storage_options { - builder = builder.with_storage_options(storage_options.clone()); - } - if let Some(ref provider) = store_params.storage_options_provider { - builder = builder.with_storage_options_provider(provider.clone()); + if let Some(accessor) = &store_params.storage_options_accessor { + builder = builder.with_storage_options_accessor(accessor.clone()); } } let dataset = Arc::new(builder.load().await?); @@ -1610,10 +1615,18 @@ impl Dataset { /// /// This returns the static initial options without triggering any refresh. /// For the latest refreshed options, use [`Self::latest_storage_options`]. + #[deprecated(since = "0.25.0", note = "Use initial_storage_options() instead")] pub fn storage_options(&self) -> Option<&HashMap> { + self.initial_storage_options() + } + + /// Returns the initial storage options without triggering any refresh. + /// + /// For the latest refreshed options, use [`Self::latest_storage_options`]. + pub fn initial_storage_options(&self) -> Option<&HashMap> { self.store_params .as_ref() - .and_then(|params| params.storage_options.as_ref()) + .and_then(|params| params.storage_options()) } /// Returns the storage options provider used when opening this dataset, if any. @@ -1622,7 +1635,8 @@ impl Dataset { ) -> Option> { self.store_params .as_ref() - .and_then(|params| params.storage_options_provider.clone()) + .and_then(|params| params.storage_options_accessor.as_ref()) + .and_then(|accessor| accessor.provider().cloned()) } /// Returns the unified storage options accessor for this dataset, if any. @@ -1633,7 +1647,7 @@ impl Dataset { pub fn storage_options_accessor(&self) -> Option> { self.store_params .as_ref() - .and_then(|params| params.get_or_create_accessor()) + .and_then(|params| params.get_accessor()) } /// Returns the latest (possibly refreshed) storage options. @@ -1655,9 +1669,8 @@ impl Dataset { return Ok(Some(options)); } - // Fallback to legacy storage_options field if no accessor - // This handles the case where storage_options was set without a provider - Ok(self.storage_options().cloned().map(StorageOptions)) + // Fallback to initial storage options if no accessor + Ok(self.initial_storage_options().cloned().map(StorageOptions)) } pub fn data_dir(&self) -> Path { diff --git a/rust/lance/src/dataset/builder.rs b/rust/lance/src/dataset/builder.rs index e3eb8efe348..789ab2b1dcc 100644 --- a/rust/lance/src/dataset/builder.rs +++ b/rust/lance/src/dataset/builder.rs @@ -148,9 +148,12 @@ impl DatasetBuilder { builder.storage_options_override = namespace_storage_options.clone(); - if namespace_storage_options.is_some() { - builder.options.storage_options_provider = Some(Arc::new( + if let Some(initial_opts) = namespace_storage_options { + let provider: Arc = Arc::new( LanceNamespaceStorageOptionsProvider::new(namespace, table_id), + ); + builder.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(initial_opts, provider), )); } @@ -273,7 +276,26 @@ impl DatasetBuilder { /// - [S3 options](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html#variants) /// - [Google options](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html#variants) pub fn with_storage_options(mut self, storage_options: HashMap) -> Self { - self.options.storage_options = Some(storage_options); + // Merge with existing options if accessor exists, otherwise create new static accessor + if let Some(existing) = self.options.storage_options_accessor.take() { + let mut merged = existing + .initial_storage_options() + .cloned() + .unwrap_or_default(); + merged.extend(storage_options); + if let Some(provider) = existing.provider().cloned() { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(merged, provider), + )); + } else { + self.options.storage_options_accessor = + Some(Arc::new(StorageOptionsAccessor::static_options(merged))); + } + } else { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::static_options(storage_options), + )); + } self } @@ -285,9 +307,25 @@ impl DatasetBuilder { /// .with_storage_option("region", "us-east-1"); /// ``` pub fn with_storage_option(mut self, key: impl AsRef, value: impl AsRef) -> Self { - let mut storage_options = self.options.storage_options.unwrap_or_default(); + let mut storage_options = self.options.storage_options().cloned().unwrap_or_default(); storage_options.insert(key.as_ref().to_string(), value.as_ref().to_string()); - self.options.storage_options = Some(storage_options); + + // Merge with existing accessor if present + if let Some(existing) = self.options.storage_options_accessor.take() { + if let Some(provider) = existing.provider().cloned() { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(storage_options, provider), + )); + } else { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::static_options(storage_options), + )); + } + } else { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::static_options(storage_options), + )); + } self } @@ -339,7 +377,20 @@ impl DatasetBuilder { mut self, provider: Arc, ) -> Self { - self.options.storage_options_provider = Some(provider); + // Preserve existing storage options if any + if let Some(existing) = self.options.storage_options_accessor.take() { + if let Some(initial) = existing.initial_storage_options().cloned() { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(initial, provider), + )); + } else { + self.options.storage_options_accessor = + Some(Arc::new(StorageOptionsAccessor::with_provider(provider))); + } + } else { + self.options.storage_options_accessor = + Some(Arc::new(StorageOptionsAccessor::with_provider(provider))); + } self } @@ -432,8 +483,8 @@ impl DatasetBuilder { let storage_options = self .options - .storage_options - .clone() + .storage_options() + .cloned() .map(StorageOptions::new) .unwrap_or_default(); let download_retry_count = storage_options.download_retry_count(); @@ -492,12 +543,29 @@ impl DatasetBuilder { } async fn load_impl(mut self) -> Result { - // Apply storage_options_override last to ensure namespace options take precedence + // Apply storage_options_override to merge namespace options with any existing accessor if let Some(override_opts) = self.storage_options_override.take() { - let mut merged_opts = self.options.storage_options.clone().unwrap_or_default(); + // Get existing options and merge + let mut merged_opts = self.options.storage_options().cloned().unwrap_or_default(); // Override with namespace storage options - they take precedence merged_opts.extend(override_opts); - self.options.storage_options = Some(merged_opts); + + // Update accessor with merged options + if let Some(accessor) = &self.options.storage_options_accessor { + if let Some(provider) = accessor.provider().cloned() { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(merged_opts, provider), + )); + } else { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::static_options(merged_opts), + )); + } + } else { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::static_options(merged_opts), + )); + } } let session = match self.session.as_ref() { diff --git a/rust/lance/src/dataset/fragment/write.rs b/rust/lance/src/dataset/fragment/write.rs index 0ab7dd15d21..a9c87e44a05 100644 --- a/rust/lance/src/dataset/fragment/write.rs +++ b/rust/lance/src/dataset/fragment/write.rs @@ -303,12 +303,12 @@ impl<'a> FragmentCreateBuilder<'a> { async fn existing_dataset_schema(&self) -> Result> { let mut builder = DatasetBuilder::from_uri(self.dataset_uri); - let storage_options = self + let accessor = self .write_params .and_then(|p| p.store_params.as_ref()) - .and_then(|p| p.storage_options.clone()); - if let Some(storage_options) = storage_options { - builder = builder.with_storage_options(storage_options); + .and_then(|p| p.storage_options_accessor.clone()); + if let Some(accessor) = accessor { + builder = builder.with_storage_options_accessor(accessor); } match builder.load().await { Ok(dataset) => { diff --git a/rust/lance/src/io.rs b/rust/lance/src/io.rs index 1ad45ce2d68..1113ef0a2a7 100644 --- a/rust/lance/src/io.rs +++ b/rust/lance/src/io.rs @@ -9,6 +9,9 @@ pub mod exec; pub use lance_io::{ bytes_read_counter, iops_counter, - object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry, WrappingObjectStore}, + object_store::{ + ObjectStore, ObjectStoreParams, ObjectStoreRegistry, StorageOptionsAccessor, + WrappingObjectStore, + }, stream::RecordBatchStream, }; From 5b40746dae3767eb3fa4b88cf717365836722d4c Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Fri, 16 Jan 2026 00:46:48 -0800 Subject: [PATCH 8/9] fix: fix failing CI tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix s3_test.rs to use storage_options_accessor instead of storage_options - Fix formatting in python/src/file.rs 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- python/src/file.rs | 14 ++++++-------- rust/lance/src/io/commit/s3_test.rs | 10 +++++----- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/python/src/file.rs b/python/src/file.rs index 5c479e0e3a8..0bff33d3201 100644 --- a/python/src/file.rs +++ b/python/src/file.rs @@ -393,14 +393,12 @@ pub async fn object_store_from_uri_or_path_with_provider( (Some(opts), Some(provider)) => Some(Arc::new( lance::io::StorageOptionsAccessor::with_initial_and_provider(opts, provider), )), - (None, Some(provider)) => { - Some(Arc::new(lance::io::StorageOptionsAccessor::with_provider( - provider, - ))) - } - (Some(opts), None) => Some(Arc::new( - lance::io::StorageOptionsAccessor::static_options(opts), - )), + (None, Some(provider)) => Some(Arc::new(lance::io::StorageOptionsAccessor::with_provider( + provider, + ))), + (Some(opts), None) => Some(Arc::new(lance::io::StorageOptionsAccessor::static_options( + opts, + ))), (None, None) => None, }; diff --git a/rust/lance/src/io/commit/s3_test.rs b/rust/lance/src/io/commit/s3_test.rs index 35e64703688..8486a11060a 100644 --- a/rust/lance/src/io/commit/s3_test.rs +++ b/rust/lance/src/io/commit/s3_test.rs @@ -8,7 +8,7 @@ use crate::{ dataset::{ builder::DatasetBuilder, CommitBuilder, InsertBuilder, ReadParams, WriteMode, WriteParams, }, - io::ObjectStoreParams, + io::{ObjectStoreParams, StorageOptionsAccessor}, }; use aws_config::{BehaviorVersion, ConfigLoader, Region, SdkConfig}; use aws_sdk_s3::{config::Credentials, Client as S3Client}; @@ -186,12 +186,12 @@ async fn test_concurrent_writers() { // Create a table let store_params = ObjectStoreParams { object_store_wrapper: Some(io_tracker.clone()), - storage_options: Some( + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::static_options( CONFIG .iter() .map(|(k, v)| (k.to_string(), v.to_string())) .collect(), - ), + ))), ..Default::default() }; let write_params = WriteParams { @@ -270,12 +270,12 @@ async fn test_ddb_open_iops() { // Create a table let store_params = ObjectStoreParams { object_store_wrapper: Some(io_tracker.clone()), - storage_options: Some( + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::static_options( CONFIG .iter() .map(|(k, v)| (k.to_string(), v.to_string())) .collect(), - ), + ))), ..Default::default() }; let write_params = WriteParams { From ae7efdc66b8ed9f291341b1dafacdf6288f6bf93 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Fri, 16 Jan 2026 11:55:16 -0800 Subject: [PATCH 9/9] refactor: rename static_options to with_static_options for consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Renamed StorageOptionsAccessor::static_options() to with_static_options() for naming consistency with with_provider() and with_initial_and_provider(). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- java/lance-jni/src/blocking_dataset.rs | 16 ++++++++-------- java/lance-jni/src/file_reader.rs | 2 +- java/lance-jni/src/file_writer.rs | 2 +- java/lance-jni/src/transaction.rs | 12 ++++++------ java/lance-jni/src/utils.rs | 6 +++--- python/src/dataset.rs | 4 ++-- python/src/file.rs | 6 +++--- python/src/storage_options.rs | 8 +++++--- rust/lance-io/src/object_store.rs | 4 ++-- rust/lance-io/src/object_store/providers.rs | 2 +- rust/lance-io/src/object_store/providers/aws.rs | 4 ++-- .../lance-io/src/object_store/providers/azure.rs | 2 +- rust/lance-io/src/object_store/providers/gcp.rs | 2 +- .../src/object_store/providers/huggingface.rs | 2 +- .../lance-io/src/object_store/storage_options.rs | 6 +++--- rust/lance-namespace-impls/src/dir.rs | 4 ++-- rust/lance-namespace-impls/src/dir/manifest.rs | 2 +- rust/lance/src/dataset/builder.rs | 15 ++++++++------- rust/lance/src/io/commit/s3_test.rs | 4 ++-- 19 files changed, 53 insertions(+), 50 deletions(-) diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index acea6166905..10647e0540c 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -100,7 +100,7 @@ impl BlockingDataset { let registry = Arc::new(ObjectStoreRegistry::default()); let object_store_params = ObjectStoreParams { storage_options_accessor: Some(Arc::new( - lance::io::StorageOptionsAccessor::static_options(storage_options), + lance::io::StorageOptionsAccessor::with_static_options(storage_options), )), ..Default::default() }; @@ -142,9 +142,9 @@ impl BlockingDataset { provider, ), )), - (false, None) => Some(Arc::new(lance::io::StorageOptionsAccessor::static_options( - storage_options, - ))), + (false, None) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), (true, Some(provider)) => Some(Arc::new( lance::io::StorageOptionsAccessor::with_provider(provider), )), @@ -186,9 +186,9 @@ impl BlockingDataset { let accessor = if storage_options.is_empty() { None } else { - Some(Arc::new(lance::io::StorageOptionsAccessor::static_options( - storage_options, - ))) + Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )) }; let inner = RT.block_on(Dataset::commit( uri, @@ -2234,7 +2234,7 @@ fn transform_jstorage_options( .map(|options| { Some(ObjectStoreParams { storage_options_accessor: Some(Arc::new( - lance::io::StorageOptionsAccessor::static_options(options), + lance::io::StorageOptionsAccessor::with_static_options(options), )), ..Default::default() }) diff --git a/java/lance-jni/src/file_reader.rs b/java/lance-jni/src/file_reader.rs index 83e1e5d45ab..ccaac121579 100644 --- a/java/lance-jni/src/file_reader.rs +++ b/java/lance-jni/src/file_reader.rs @@ -113,7 +113,7 @@ fn inner_open<'local>( let reader = RT.block_on(async move { let object_params = ObjectStoreParams { storage_options_accessor: Some(Arc::new( - lance::io::StorageOptionsAccessor::static_options(storage_options), + lance::io::StorageOptionsAccessor::with_static_options(storage_options), )), ..Default::default() }; diff --git a/java/lance-jni/src/file_writer.rs b/java/lance-jni/src/file_writer.rs index 3328ddc4d3c..ebc5b1c328b 100644 --- a/java/lance-jni/src/file_writer.rs +++ b/java/lance-jni/src/file_writer.rs @@ -95,7 +95,7 @@ fn inner_open<'local>( let writer = RT.block_on(async move { let object_params = ObjectStoreParams { storage_options_accessor: Some(Arc::new( - lance::io::StorageOptionsAccessor::static_options(storage_options), + lance::io::StorageOptionsAccessor::with_static_options(storage_options), )), ..Default::default() }; diff --git a/java/lance-jni/src/transaction.rs b/java/lance-jni/src/transaction.rs index e5cc16413c1..42331f99b5f 100644 --- a/java/lance-jni/src/transaction.rs +++ b/java/lance-jni/src/transaction.rs @@ -777,16 +777,16 @@ fn inner_commit_transaction<'local>( ), )) } else { - Some(Arc::new(lance::io::StorageOptionsAccessor::static_options( - merged, - ))) + Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(merged), + )) } } None => { if !write_param.is_empty() { - Some(Arc::new(lance::io::StorageOptionsAccessor::static_options( - write_param, - ))) + Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(write_param), + )) } else { None } diff --git a/java/lance-jni/src/utils.rs b/java/lance-jni/src/utils.rs index d12a60286c5..02c5596d74b 100644 --- a/java/lance-jni/src/utils.rs +++ b/java/lance-jni/src/utils.rs @@ -110,9 +110,9 @@ pub fn extract_write_params( (false, Some(provider)) => Some(Arc::new( lance::io::StorageOptionsAccessor::with_initial_and_provider(storage_options, provider), )), - (false, None) => Some(Arc::new(lance::io::StorageOptionsAccessor::static_options( - storage_options, - ))), + (false, None) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), (true, Some(provider)) => Some(Arc::new(lance::io::StorageOptionsAccessor::with_provider( provider, ))), diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 7e2f682d876..11047f33c12 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -1565,7 +1565,7 @@ impl Dataset { // `storage_options` will be forwarded to the object store params for the new dataset. let store_params = storage_options.as_ref().map(|opts| ObjectStoreParams { storage_options_accessor: Some(Arc::new( - lance::io::StorageOptionsAccessor::static_options(opts.clone()), + lance::io::StorageOptionsAccessor::with_static_options(opts.clone()), )), ..Default::default() }); @@ -1746,7 +1746,7 @@ impl Dataset { let reference = self.transform_ref(reference)?; let store_params = storage_options.map(|opts| ObjectStoreParams { storage_options_accessor: Some(Arc::new( - lance::io::StorageOptionsAccessor::static_options(opts), + lance::io::StorageOptionsAccessor::with_static_options(opts), )), ..Default::default() }); diff --git a/python/src/file.rs b/python/src/file.rs index 0bff33d3201..4256d4e74d6 100644 --- a/python/src/file.rs +++ b/python/src/file.rs @@ -396,9 +396,9 @@ pub async fn object_store_from_uri_or_path_with_provider( (None, Some(provider)) => Some(Arc::new(lance::io::StorageOptionsAccessor::with_provider( provider, ))), - (Some(opts), None) => Some(Arc::new(lance::io::StorageOptionsAccessor::static_options( - opts, - ))), + (Some(opts), None) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(opts), + )), (None, None) => None, }; diff --git a/python/src/storage_options.rs b/python/src/storage_options.rs index 0c8a4d2d00b..7e2c6011ece 100644 --- a/python/src/storage_options.rs +++ b/python/src/storage_options.rs @@ -191,9 +191,9 @@ impl PyStorageOptionsAccessor { impl PyStorageOptionsAccessor { /// Create an accessor with only static options (no refresh capability) #[staticmethod] - fn static_options(options: HashMap) -> Self { + fn with_static_options(options: HashMap) -> Self { Self { - inner: Arc::new(StorageOptionsAccessor::static_options(options)), + inner: Arc::new(StorageOptionsAccessor::with_static_options(options)), } } @@ -285,7 +285,9 @@ pub fn create_accessor_from_python( rust_provider, )))) } - (Some(opts), None) => Ok(Some(Arc::new(StorageOptionsAccessor::static_options(opts)))), + (Some(opts), None) => Ok(Some(Arc::new(StorageOptionsAccessor::with_static_options( + opts, + )))), (None, None) => Ok(None), } } diff --git a/rust/lance-io/src/object_store.rs b/rust/lance-io/src/object_store.rs index 9947781504f..44324fa3ee3 100644 --- a/rust/lance-io/src/object_store.rs +++ b/rust/lance-io/src/object_store.rs @@ -197,7 +197,7 @@ pub struct ObjectStoreParams { /// Unified storage options accessor with caching and automatic refresh /// /// Provides storage options and optionally a dynamic provider for automatic - /// credential refresh. Use `StorageOptionsAccessor::static_options()` for static + /// credential refresh. Use `StorageOptionsAccessor::with_static_options()` for static /// options or `StorageOptionsAccessor::with_initial_and_provider()` for dynamic refresh. pub storage_options_accessor: Option>, /// Use constant size upload parts for multipart uploads. Only necessary @@ -1000,7 +1000,7 @@ mod tests { let registry = Arc::new(ObjectStoreRegistry::default()); let accessor = storage_options .clone() - .map(|opts| Arc::new(StorageOptionsAccessor::static_options(opts))); + .map(|opts| Arc::new(StorageOptionsAccessor::with_static_options(opts))); let params = ObjectStoreParams { storage_options_accessor: accessor.clone(), ..ObjectStoreParams::default() diff --git a/rust/lance-io/src/object_store/providers.rs b/rust/lance-io/src/object_store/providers.rs index 1848cf30b27..b48e6f20079 100644 --- a/rust/lance-io/src/object_store/providers.rs +++ b/rust/lance-io/src/object_store/providers.rs @@ -423,7 +423,7 @@ mod tests { let params1 = ObjectStoreParams::default(); let params2 = ObjectStoreParams { - storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::static_options( + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( HashMap::from([("k".into(), "v".into())]), ))), ..Default::default() diff --git a/rust/lance-io/src/object_store/providers/aws.rs b/rust/lance-io/src/object_store/providers/aws.rs index 53e5a93bde1..982470581f2 100644 --- a/rust/lance-io/src/object_store/providers/aws.rs +++ b/rust/lance-io/src/object_store/providers/aws.rs @@ -446,7 +446,7 @@ impl ObjectStoreParams { let storage_options_accessor = region.map(|region| { let opts: HashMap = [("region".into(), region)].iter().cloned().collect(); - Arc::new(StorageOptionsAccessor::static_options(opts)) + Arc::new(StorageOptionsAccessor::with_static_options(opts)) }); Self { aws_credentials, @@ -675,7 +675,7 @@ mod tests { let provider = AwsStoreProvider; let url = Url::parse("s3://test-bucket/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::static_options( + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( HashMap::from([ ("use_opendal".to_string(), "true".to_string()), ("region".to_string(), "us-west-2".to_string()), diff --git a/rust/lance-io/src/object_store/providers/azure.rs b/rust/lance-io/src/object_store/providers/azure.rs index 1c64f412d69..7bf566c8972 100644 --- a/rust/lance-io/src/object_store/providers/azure.rs +++ b/rust/lance-io/src/object_store/providers/azure.rs @@ -236,7 +236,7 @@ mod tests { let provider = AzureBlobStoreProvider; let url = Url::parse("az://test-container/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::static_options( + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( HashMap::from([ ("use_opendal".to_string(), "true".to_string()), ("account_name".to_string(), "test_account".to_string()), diff --git a/rust/lance-io/src/object_store/providers/gcp.rs b/rust/lance-io/src/object_store/providers/gcp.rs index c5fa7e6fca7..dba5cd8dd40 100644 --- a/rust/lance-io/src/object_store/providers/gcp.rs +++ b/rust/lance-io/src/object_store/providers/gcp.rs @@ -186,7 +186,7 @@ mod tests { let provider = GcsStoreProvider; let url = Url::parse("gs://test-bucket/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::static_options( + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( HashMap::from([ ("use_opendal".to_string(), "true".to_string()), ( diff --git a/rust/lance-io/src/object_store/providers/huggingface.rs b/rust/lance-io/src/object_store/providers/huggingface.rs index 47a752768f0..55c5f6d50b9 100644 --- a/rust/lance-io/src/object_store/providers/huggingface.rs +++ b/rust/lance-io/src/object_store/providers/huggingface.rs @@ -163,7 +163,7 @@ mod tests { use std::sync::Arc; let url = Url::parse("hf://datasets/acme/repo/data/file").unwrap(); let params = ObjectStoreParams { - storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::static_options( + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( HashMap::from([(String::from("hf_revision"), String::from("stable"))]), ))), ..Default::default() diff --git a/rust/lance-io/src/object_store/storage_options.rs b/rust/lance-io/src/object_store/storage_options.rs index 8204210990b..d0f5cc20e93 100644 --- a/rust/lance-io/src/object_store/storage_options.rs +++ b/rust/lance-io/src/object_store/storage_options.rs @@ -218,7 +218,7 @@ impl StorageOptionsAccessor { /// /// The returned accessor will always return the provided options. /// This is useful when credentials don't expire or are managed externally. - pub fn static_options(options: HashMap) -> Self { + pub fn with_static_options(options: HashMap) -> Self { let expires_at_millis = options .get(EXPIRES_AT_MILLIS_KEY) .and_then(|s| s.parse::().ok()); @@ -537,7 +537,7 @@ mod tests { ("key1".to_string(), "value1".to_string()), ("key2".to_string(), "value2".to_string()), ]); - let accessor = StorageOptionsAccessor::static_options(options.clone()); + let accessor = StorageOptionsAccessor::with_static_options(options.clone()); let result = accessor.get_storage_options().await.unwrap(); assert_eq!(result.0, options); @@ -649,7 +649,7 @@ mod tests { #[tokio::test] async fn test_accessor_id_static() { let options = HashMap::from([("key".to_string(), "value".to_string())]); - let accessor = StorageOptionsAccessor::static_options(options); + let accessor = StorageOptionsAccessor::with_static_options(options); let id = accessor.accessor_id(); assert!(id.starts_with("static_options_")); diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index 005e09641cd..875df33e580 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -472,7 +472,7 @@ impl DirectoryNamespaceBuilder { ) -> Result<(Arc, Path)> { // Build ObjectStoreParams from storage options let accessor = storage_options.clone().map(|opts| { - Arc::new(lance_io::object_store::StorageOptionsAccessor::static_options(opts)) + Arc::new(lance_io::object_store::StorageOptionsAccessor::with_static_options(opts)) }); let params = ObjectStoreParams { storage_options_accessor: accessor, @@ -1266,7 +1266,7 @@ impl LanceNamespace for DirectoryNamespace { let store_params = self.storage_options.as_ref().map(|opts| ObjectStoreParams { storage_options_accessor: Some(Arc::new( - lance_io::object_store::StorageOptionsAccessor::static_options(opts.clone()), + lance_io::object_store::StorageOptionsAccessor::with_static_options(opts.clone()), )), ..Default::default() }); diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index 433a0ce4719..49d19712e26 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -983,7 +983,7 @@ impl ManifestNamespace { session, store_params: storage_options.as_ref().map(|opts| ObjectStoreParams { storage_options_accessor: Some(Arc::new( - lance_io::object_store::StorageOptionsAccessor::static_options( + lance_io::object_store::StorageOptionsAccessor::with_static_options( opts.clone(), ), )), diff --git a/rust/lance/src/dataset/builder.rs b/rust/lance/src/dataset/builder.rs index 789ab2b1dcc..9eba7009d19 100644 --- a/rust/lance/src/dataset/builder.rs +++ b/rust/lance/src/dataset/builder.rs @@ -288,12 +288,13 @@ impl DatasetBuilder { StorageOptionsAccessor::with_initial_and_provider(merged, provider), )); } else { - self.options.storage_options_accessor = - Some(Arc::new(StorageOptionsAccessor::static_options(merged))); + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_static_options(merged), + )); } } else { self.options.storage_options_accessor = Some(Arc::new( - StorageOptionsAccessor::static_options(storage_options), + StorageOptionsAccessor::with_static_options(storage_options), )); } self @@ -318,12 +319,12 @@ impl DatasetBuilder { )); } else { self.options.storage_options_accessor = Some(Arc::new( - StorageOptionsAccessor::static_options(storage_options), + StorageOptionsAccessor::with_static_options(storage_options), )); } } else { self.options.storage_options_accessor = Some(Arc::new( - StorageOptionsAccessor::static_options(storage_options), + StorageOptionsAccessor::with_static_options(storage_options), )); } self @@ -558,12 +559,12 @@ impl DatasetBuilder { )); } else { self.options.storage_options_accessor = Some(Arc::new( - StorageOptionsAccessor::static_options(merged_opts), + StorageOptionsAccessor::with_static_options(merged_opts), )); } } else { self.options.storage_options_accessor = Some(Arc::new( - StorageOptionsAccessor::static_options(merged_opts), + StorageOptionsAccessor::with_static_options(merged_opts), )); } } diff --git a/rust/lance/src/io/commit/s3_test.rs b/rust/lance/src/io/commit/s3_test.rs index 8486a11060a..1402fb25d46 100644 --- a/rust/lance/src/io/commit/s3_test.rs +++ b/rust/lance/src/io/commit/s3_test.rs @@ -186,7 +186,7 @@ async fn test_concurrent_writers() { // Create a table let store_params = ObjectStoreParams { object_store_wrapper: Some(io_tracker.clone()), - storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::static_options( + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( CONFIG .iter() .map(|(k, v)| (k.to_string(), v.to_string())) @@ -270,7 +270,7 @@ async fn test_ddb_open_iops() { // Create a table let store_params = ObjectStoreParams { object_store_wrapper: Some(io_tracker.clone()), - storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::static_options( + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( CONFIG .iter() .map(|(k, v)| (k.to_string(), v.to_string()))