From 207699b3854dc4bdfb93aa3291b0584709dd6d22 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 9 Oct 2025 15:10:28 -0700 Subject: [PATCH 01/11] commit --- protos/transaction.proto | 2 + python/python/lance/__init__.py | 2 + python/python/lance/dataset.py | 50 + python/python/tests/test_multi_bases.py | 515 +++++++++ python/src/dataset.rs | 99 +- python/src/lib.rs | 3 +- python/src/transaction.rs | 18 +- rust/lance-io/src/object_store.rs | 21 + rust/lance-table/src/feature_flags.rs | 52 +- rust/lance-table/src/format/fragment.rs | 20 +- rust/lance-table/src/format/manifest.rs | 62 +- rust/lance/src/dataset.rs | 39 +- rust/lance/src/dataset/builder.rs | 4 + rust/lance/src/dataset/fragment.rs | 40 +- rust/lance/src/dataset/fragment/write.rs | 1 + rust/lance/src/dataset/optimize.rs | 1 + rust/lance/src/dataset/transaction.rs | 78 +- rust/lance/src/dataset/write.rs | 998 +++++++++++++++++- rust/lance/src/dataset/write/commit.rs | 2 + rust/lance/src/dataset/write/insert.rs | 56 +- rust/lance/src/dataset/write/merge_insert.rs | 2 + .../dataset/write/merge_insert/exec/write.rs | 1 + rust/lance/src/dataset/write/update.rs | 1 + rust/lance/src/io/commit.rs | 18 +- rust/lance/src/io/commit/conflict_resolver.rs | 2 + rust/lance/src/utils/test.rs | 1 + 26 files changed, 1987 insertions(+), 101 deletions(-) create mode 100644 python/python/tests/test_multi_bases.py diff --git a/protos/transaction.proto b/protos/transaction.proto index a817835be29..5840f3180ad 100644 --- a/protos/transaction.proto +++ b/protos/transaction.proto @@ -68,6 +68,8 @@ message Transaction { map schema_metadata = 3; // Key-value pairs to merge with existing config. map config_upsert_values = 4; + // The base paths to be added for the initial dataset creation + repeated BasePath initial_bases = 5; } // Add or replace a new secondary index. diff --git a/python/python/lance/__init__.py b/python/python/lance/__init__.py index 9bc9c8ccd77..ca94f80bf52 100644 --- a/python/python/lance/__init__.py +++ b/python/python/lance/__init__.py @@ -26,6 +26,7 @@ ) from .fragment import FragmentMetadata, LanceFragment from .lance import ( + DatasetBasePath, FFILanceTableProvider, ScanStatistics, bytes_read_counter, @@ -47,6 +48,7 @@ __all__ = [ "BlobColumn", "BlobFile", + "DatasetBasePath", "DataStatistics", "FieldStatistics", "FragmentMetadata", diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index ad4f66e3b7e..43cd7285e02 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -52,6 +52,7 @@ CleanupStats, Compaction, CompactionMetrics, + DatasetBasePath, LanceSchema, ScanStatistics, _Dataset, @@ -4897,6 +4898,7 @@ def write_dataset( auto_cleanup_options: Optional[AutoCleanupConfig] = None, commit_message: Optional[str] = None, transaction_properties: Optional[Dict[str, str]] = None, + target_bases: Optional[List[Union[str, DatasetBasePath]]] = None, ) -> LanceDataset: """Write a given data_obj to the given uri @@ -4975,6 +4977,53 @@ def write_dataset( and can be retrieved using read_transaction(). If both `commit_message` and `properties` are provided, `commit_message` will override any "lance.commit.message" key in `properties`. + target_bases: list of str or DatasetBasePath, optional + Specifies which base paths to use for writing data. Can contain: + + - :class:`DatasetBasePath` objects: Register new base paths (**CREATE mode only**) + - Strings: Reference existing bases by name or path (for all modes) + + **Important**: OVERWRITE mode **always inherits** existing base_paths and cannot + register new ones. Only CREATE mode can register new bases. + + When a string is provided, it will be resolved by trying to match: + 1. Base name (e.g., "primary", "archive") + 2. Base path URI (e.g., "s3://bucket/path") + + For CREATE mode, you can define new bases and write to them in one call: + + >>> bases = [DatasetBasePath("s3://bucket1/data", name="primary"), + ... DatasetBasePath("s3://bucket2/data", name="archive")] + >>> ds = write_dataset(data, "s3://bucket1/dataset", mode="create", + ... target_bases=bases) + + For OVERWRITE and APPEND modes, reference existing bases by name or path: + + >>> # OVERWRITE always inherits existing base_paths + >>> write_dataset(data, ds, mode="overwrite", target_bases=["primary"]) + >>> write_dataset(data, ds, mode="append", target_bases=["primary"]) + >>> write_dataset(data, ds, mode="append", target_bases=["s3://bucket1/data"]) + + Examples + -------- + Create a dataset with multiple base paths: + + >>> import lance + >>> from lance import DatasetBasePath + >>> bases = [ + ... DatasetBasePath("s3://fast-storage/data", name="hot"), + ... DatasetBasePath("s3://archive-storage/data", name="cold") + ... ] + >>> ds = lance.write_dataset(data, "s3://fast-storage/mydataset", + ... mode="create", target_bases=bases) + + Append data to a specific base by name: + + >>> lance.write_dataset(new_data, ds, mode="append", target_bases=["hot"]) + + Overwrite data (inherits existing base_paths): + + >>> lance.write_dataset(new_data, ds, mode="overwrite", target_bases=["cold"]) """ if use_legacy_format is not None: warnings.warn( @@ -5016,6 +5065,7 @@ def write_dataset( "enable_stable_row_ids": enable_stable_row_ids, "auto_cleanup_options": auto_cleanup_options, "transaction_properties": merged_properties, + "target_bases": target_bases, } if commit_lock: diff --git a/python/python/tests/test_multi_bases.py b/python/python/tests/test_multi_bases.py new file mode 100644 index 00000000000..dcd1e84c7fc --- /dev/null +++ b/python/python/tests/test_multi_bases.py @@ -0,0 +1,515 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +""" +Tests for multi-bases dataset functionality. +""" + +import shutil +import tempfile +import uuid +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path + +import lance +import pandas as pd +import pyarrow as pa +import pytest +from lance import DatasetBasePath + + +class TestMultiBases: + """Test multi-bases dataset functionality with local file system.""" + + def setup_method(self): + """Set up test directories for each test.""" + self.test_dir = tempfile.mkdtemp() + self.test_id = str(uuid.uuid4())[:8] + + # Create primary and additional path directories + self.primary_uri = str(Path(self.test_dir) / "primary") + self.path1_uri = str(Path(self.test_dir) / f"path1_{self.test_id}") + self.path2_uri = str(Path(self.test_dir) / f"path2_{self.test_id}") + self.path3_uri = str(Path(self.test_dir) / f"path3_{self.test_id}") + + # Create directories + for uri in [self.primary_uri, self.path1_uri, self.path2_uri, self.path3_uri]: + Path(uri).mkdir(parents=True, exist_ok=True) + + def teardown_method(self): + """Clean up test directories after each test.""" + if hasattr(self, "test_dir"): + shutil.rmtree(self.test_dir, ignore_errors=True) + + def create_test_data(self, num_rows=500, id_offset=0): + """Create test data for multi-bases tests.""" + return pd.DataFrame( + { + "id": range(id_offset, id_offset + num_rows), + "value": [f"value_{i}" for i in range(id_offset, id_offset + num_rows)], + "score": [i * 0.1 for i in range(id_offset, id_offset + num_rows)], + } + ) + + def test_multi_bases_create_and_read(self): + """Test creating a multi-bases dataset and reading it back.""" + data = self.create_test_data(500) + + # Create dataset with multi-bases layout + dataset = lance.write_dataset( + data, + self.primary_uri, + mode="create", + target_bases=[ + DatasetBasePath(self.path1_uri, name="path1"), + DatasetBasePath(self.path2_uri, name="path2"), + DatasetBasePath(self.path3_uri, name="path3"), + "path2", # Write data to path2 + ], + max_rows_per_file=100, # Force multiple fragments + ) + + assert dataset is not None + assert dataset.uri == self.primary_uri + + # Verify we can read the data back + result = dataset.to_table().to_pandas() + assert len(result) == 500 + + # Verify data integrity + pd.testing.assert_frame_equal( + result.sort_values("id").reset_index(drop=True), + data.sort_values("id").reset_index(drop=True), + ) + + def test_multi_bases_append_mode(self): + """Test appending data to a multi-bases dataset.""" + # Create initial dataset + initial_data = self.create_test_data(300) + + dataset = lance.write_dataset( + initial_data, + self.primary_uri, + mode="create", + target_bases=[ + DatasetBasePath(self.path1_uri, name="path1"), + DatasetBasePath(self.path2_uri, name="path2"), + "path1", # Write to path1 + ], + max_rows_per_file=100, + ) + + # Create additional data to append + append_data = self.create_test_data(100, id_offset=300) + + # Append to different path + updated_dataset = lance.write_dataset( + append_data, + dataset, + mode="append", + target_bases=["path2"], # Write to path2 + max_rows_per_file=50, + ) + + # Verify total data + result = updated_dataset.to_table().to_pandas() + assert len(result) == 400 + + # Verify all data is present + expected_ids = set(range(400)) + actual_ids = set(result["id"].tolist()) + assert actual_ids == expected_ids + + def test_multi_bases_overwrite_mode_inherits_bases(self): + """Test OVERWRITE mode inherits existing base configuration.""" + # Create initial dataset with multi-base configuration + initial_data = self.create_test_data(200) + + lance.write_dataset( + initial_data, + self.primary_uri, + mode="create", + target_bases=[ + DatasetBasePath(self.path1_uri, name="path1"), + DatasetBasePath(self.path2_uri, name="path2"), + "path1", # Write to path1 + ], + max_rows_per_file=100, + ) + + # Create new data for overwrite + overwrite_data = self.create_test_data(150, id_offset=100) + + # Overwrite - should inherit existing bases (path1, path2) + # Write to path2 (existing base, referenced by name) + updated_dataset = lance.write_dataset( + overwrite_data, + self.primary_uri, + mode="overwrite", + target_bases=["path2"], # Reference existing base by name + max_rows_per_file=75, + ) + + # Verify overwritten data + result = updated_dataset.to_table().to_pandas() + assert len(result) == 150 + + # Verify data content + expected_ids = set(range(100, 250)) + actual_ids = set(result["id"].tolist()) + assert actual_ids == expected_ids + + # Verify base_paths are preserved from initial dataset + base_paths = updated_dataset._ds.base_paths() + assert len(base_paths) == 2 + assert any(bp.name == "path1" for bp in base_paths.values()) + assert any(bp.name == "path2" for bp in base_paths.values()) + + def test_multi_bases_overwrite_mode_primary_path_default(self): + """Test that OVERWRITE mode defaults to primary path when no target specified.""" + # Create initial dataset with explicit data file bases + initial_data = self.create_test_data(100) + + lance.write_dataset( + initial_data, + self.primary_uri, + mode="create", + target_bases=[ + DatasetBasePath(self.path1_uri, name="path1"), + DatasetBasePath(self.path2_uri, name="path2"), + "path1", # Write to path1 + ], + max_rows_per_file=50, + ) + + # Overwrite without specifying target - should use primary path + # This clears the old base_paths and writes to primary path only + overwrite_data = self.create_test_data(75, id_offset=200) + + updated_dataset = lance.write_dataset( + overwrite_data, + self.primary_uri, + mode="overwrite", + # No target_bases specified - data goes to primary path + # Old bases (path1, path2) are NOT preserved + max_rows_per_file=25, + ) + + # Verify overwritten data + result = updated_dataset.to_table().to_pandas() + assert len(result) == 75 + + # Verify data content + expected_ids = set(range(200, 275)) + actual_ids = set(result["id"].tolist()) + assert actual_ids == expected_ids + + # Verify base_paths are preserved from previous manifest + base_paths = updated_dataset._ds.base_paths() + # Old path1 and path2 ARE preserved in manifest + assert len(base_paths) == 2 + assert any(bp.name == "path1" for bp in base_paths.values()) + assert any(bp.name == "path2" for bp in base_paths.values()) + + # Since base_paths exist, we can append to them by name + append_data = self.create_test_data(25, id_offset=300) + + final_dataset = lance.write_dataset( + append_data, + updated_dataset, + mode="append", + target_bases=["path2"], # Specify which base to use + ) + + final_result = final_dataset.to_table().to_pandas() + assert len(final_result) == 100 + + def test_concurrent_appends(self): + """Test concurrent appends to multi-bases dataset.""" + # Create initial dataset + initial_data = self.create_test_data(100) + + lance.write_dataset( + initial_data, + self.primary_uri, + mode="create", + target_bases=[ + DatasetBasePath(self.path1_uri, name="path1"), + DatasetBasePath(self.path2_uri, name="path2"), + DatasetBasePath(self.path3_uri, name="path3"), + "path1", # Write to path1 + ], + max_rows_per_file=50, + ) + + def append_worker(worker_id, target_path_name, start_id, num_rows): + """Worker function to append data concurrently.""" + worker_data = self.create_test_data(num_rows, id_offset=start_id) + + # Load dataset fresh for this worker + worker_dataset = lance.dataset(self.primary_uri) + + # Perform append + return lance.write_dataset( + worker_data, + worker_dataset, + mode="append", + target_bases=[target_path_name], # Reference by name + max_rows_per_file=25, + ) + + # Configure concurrent append tasks + append_tasks = [ + {"worker_id": 1, "target_path": "path1", "start_id": 100, "num_rows": 50}, + {"worker_id": 2, "target_path": "path2", "start_id": 150, "num_rows": 50}, + {"worker_id": 3, "target_path": "path3", "start_id": 200, "num_rows": 50}, + ] + + # Execute concurrent appends + results = [] + with ThreadPoolExecutor(max_workers=3) as executor: + future_to_task = { + executor.submit( + append_worker, + task["worker_id"], + task["target_path"], + task["start_id"], + task["num_rows"], + ): task + for task in append_tasks + } + + for future in as_completed(future_to_task): + results.append(future.result()) + + # Verify final dataset integrity + final_dataset = lance.dataset(self.primary_uri) + final_data = final_dataset.to_table().to_pandas() + + # Should have initial 100 + 3 * 50 = 250 rows + assert len(final_data) == 250 + + # Verify all expected IDs are present + expected_ids = set(range(250)) + actual_ids = set(final_data["id"].tolist()) + assert actual_ids == expected_ids + + def test_validation_errors(self): + """Test validation errors for invalid multi-bases configurations.""" + data = self.create_test_data(100) + + # Test 1: Target reference not found in new bases for CREATE mode + with pytest.raises(Exception, match="not found"): + lance.write_dataset( + data, + self.primary_uri, + mode="create", + target_bases=[ + DatasetBasePath(self.path1_uri, name="path1"), + DatasetBasePath(self.path2_uri, name="path2"), + "path3", # Not in new bases + ], + ) + + # Test 2: DatasetBasePath in APPEND mode + # First create a base dataset + base_dataset = lance.write_dataset( + data, + self.primary_uri, + mode="create", + target_bases=[ + DatasetBasePath(self.path1_uri, name="path1"), + DatasetBasePath(self.path2_uri, name="path2"), + "path1", + ], + ) + + with pytest.raises(Exception, match="Cannot register new bases in Append mode"): + lance.write_dataset( + data, + base_dataset, + mode="append", + target_bases=[ + DatasetBasePath( + name="path3", path=self.path3_uri + ), # New base not allowed + "path3", + ], + ) + + # Test 3: Write to dataset with explicit data file bases without target_bases + with pytest.raises(Exception, match="target_bases"): + lance.write_dataset( + data, + base_dataset, + mode="append", + # No target_bases specified + ) + + def test_fragment_distribution(self): + """Test that fragments are correctly distributed and readable.""" + data = self.create_test_data(1000) + + dataset = lance.write_dataset( + data, + self.primary_uri, + mode="create", + target_bases=[ + DatasetBasePath(self.path1_uri, name="path1"), + DatasetBasePath(self.path2_uri, name="path2"), + "path1", + ], + max_rows_per_file=100, # Should create 10 fragments + ) + + # Verify fragment count + fragments = list(dataset.get_fragments()) + assert len(fragments) >= 10 # At least 10 fragments + + # Verify all data is readable + result = dataset.to_table().to_pandas() + assert len(result) == 1000 + + # Test scanning with filters + filtered_result = dataset.scanner(filter="id < 500").to_table().to_pandas() + assert len(filtered_result) == 500 + assert all(filtered_result["id"] < 500) + + def test_schema_consistency(self): + """Test that schema is consistent across multi-bases operations.""" + # Create dataset with specific schema + schema = pa.schema( + [ + pa.field("id", pa.int64()), + pa.field("name", pa.string()), + pa.field("score", pa.float64()), + ] + ) + + data = pa.table( + { + "id": [1, 2, 3, 4, 5], + "name": ["a", "b", "c", "d", "e"], + "score": [1.0, 2.0, 3.0, 4.0, 5.0], + }, + schema=schema, + ) + + dataset = lance.write_dataset( + data, + self.primary_uri, + mode="create", + target_bases=[ + DatasetBasePath(self.path1_uri, name="path1"), + DatasetBasePath(self.path2_uri, name="path2"), + "path1", + ], + ) + + # Verify schema + assert dataset.schema == schema + + # Append with same schema to different path + append_data = pa.table( + {"id": [6, 7, 8], "name": ["f", "g", "h"], "score": [6.0, 7.0, 8.0]}, + schema=schema, + ) + + updated_dataset = lance.write_dataset( + append_data, dataset, mode="append", target_bases=["path2"] + ) + + # Schema should remain consistent + assert updated_dataset.schema == schema + + # Verify all data + result = updated_dataset.to_table() + assert len(result) == 8 + assert result.schema == schema + + def test_dataset_reopening(self): + """Test reopening a multi-bases dataset.""" + data = self.create_test_data(300) + + # Create multi-bases dataset + dataset = lance.write_dataset( + data, + self.primary_uri, + mode="create", + target_bases=[ + DatasetBasePath(self.path1_uri, name="path1"), + DatasetBasePath(self.path2_uri, name="path2"), + "path1", + ], + max_rows_per_file=100, + ) + + # Close and reopen dataset + del dataset + reopened_dataset = lance.dataset(self.primary_uri) + + # Verify data is still accessible + result = reopened_dataset.to_table().to_pandas() + assert len(result) == 300 + + # Verify we can still append to different paths + append_data = self.create_test_data(50, id_offset=300) + + updated_dataset = lance.write_dataset( + append_data, + reopened_dataset, + mode="append", + target_bases=["path2"], # Reference by name + ) + + final_result = updated_dataset.to_table().to_pandas() + assert len(final_result) == 350 + + def test_single_path_mode(self): + """Test behavior with single path configuration.""" + data = self.create_test_data(100) + + # Test with single path (should work like normal dataset) + dataset = lance.write_dataset( + data, + self.primary_uri, + mode="create", + target_bases=[ + DatasetBasePath(self.path1_uri, name="path1"), + "path1", + ], + ) + + result = dataset.to_table().to_pandas() + assert len(result) == 100 + + # Verify data integrity + pd.testing.assert_frame_equal( + result.sort_values("id").reset_index(drop=True), + data.sort_values("id").reset_index(drop=True), + ) + + def test_primary_path_write_rejection(self): + """Test that writing to primary path URI is rejected.""" + data = self.create_test_data(50) + + # Create dataset with explicit data file bases + dataset = lance.write_dataset( + data, + self.primary_uri, + mode="create", + target_bases=[ + DatasetBasePath(self.path1_uri, name="path1"), + DatasetBasePath(self.path2_uri, name="path2"), + "path1", + ], + ) + + # Try to append using primary path URI (should fail - not in registered bases) + with pytest.raises(Exception, match="not found"): + lance.write_dataset( + data, + dataset, + mode="append", + target_bases=[self.primary_uri], # Primary path URI not registered + ) diff --git a/python/src/dataset.rs b/python/src/dataset.rs index de693429217..7b5933ef939 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -80,7 +80,7 @@ use lance_index::{ }; use lance_io::object_store::ObjectStoreParams; use lance_linalg::distance::MetricType; -use lance_table::format::Fragment; +use lance_table::format::{BasePath, Fragment}; use lance_table::io::commit::CommitHandler; use crate::error::PythonErrorExt; @@ -379,6 +379,63 @@ impl<'py> IntoPyObject<'py> for PyLance<&ColumnOrdering> { } } +/// Python binding for BasePath +#[pyclass(name = "DatasetBasePath", module = "lance")] +#[derive(Clone)] +pub struct DatasetBasePath { + #[pyo3(get)] + pub id: u32, + #[pyo3(get)] + pub name: Option, + #[pyo3(get)] + pub path: String, + #[pyo3(get)] + pub is_dataset_root: bool, +} + +#[pymethods] +impl DatasetBasePath { + #[new] + #[pyo3(signature = (path, name = None, is_dataset_root = false, id = None))] + fn new(path: String, name: Option, is_dataset_root: bool, id: Option) -> Self { + Self { + id: id.unwrap_or(0), + name, + path, + is_dataset_root, + } + } + + fn __repr__(&self) -> String { + format!( + "DatasetBasePath(id={}, name={:?}, path={}, is_dataset_root={})", + self.id, self.name, self.path, self.is_dataset_root + ) + } +} + +impl From for DatasetBasePath { + fn from(base_path: BasePath) -> Self { + Self { + id: base_path.id, + name: base_path.name, + path: base_path.path, + is_dataset_root: base_path.is_dataset_root, + } + } +} + +impl From for BasePath { + fn from(py_base_path: DatasetBasePath) -> Self { + BasePath::new( + py_base_path.id, + py_base_path.path.clone(), + py_base_path.name, + py_base_path.is_dataset_root, + ) + } +} + /// Lance Dataset that will be wrapped by another class in Python #[pyclass(name = "_Dataset", module = "_lib")] #[derive(Clone)] @@ -584,6 +641,21 @@ impl Dataset { PyBytes::new(py, &manifest_bytes).into() } + /// Get base paths from the manifest. + /// + /// Returns a dictionary mapping base_id to DatasetBasePath objects. + fn base_paths(&self, py: Python) -> PyResult { + let manifest = self.ds.manifest(); + let dict = pyo3::types::PyDict::new(py); + + for (base_id, base_path) in manifest.base_paths.iter() { + let py_base_path = DatasetBasePath::from(base_path.clone()); + dict.set_item(base_id, py_base_path.into_py_any(py)?)?; + } + + Ok(dict.into()) + } + /// Load index metadata. /// /// This call will open the index and return its concrete index type. @@ -2638,6 +2710,31 @@ pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult>>(options, "target_bases")? { + let mut new_bases = Vec::new(); + let mut target_base_refs = Vec::new(); + + for item in target_bases_list.iter() { + if let Ok(dataset_base_path) = item.extract::() { + new_bases.push(BasePath::from(dataset_base_path)); + } else if let Ok(s) = item.extract::() { + target_base_refs.push(s); + } else { + return Err(PyValueError::new_err( + "target_bases must contain DatasetBasePath objects or strings" + )); + } + } + + if !new_bases.is_empty() { + p = p.with_initial_bases(new_bases); + } + + if !target_base_refs.is_empty() { + p = p.with_target_base_names_or_paths(target_base_refs); + } + } + // Handle properties if let Some(props) = get_dict_opt::>(options, "transaction_properties")? diff --git a/python/src/lib.rs b/python/src/lib.rs index 5365765ca69..419430179c1 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -47,7 +47,7 @@ use dataset::cleanup::CleanupStats; use dataset::optimize::{ PyCompaction, PyCompactionMetrics, PyCompactionPlan, PyCompactionTask, PyRewriteResult, }; -use dataset::{MergeInsertBuilder, PyFullTextQuery}; +use dataset::{DatasetBasePath, MergeInsertBuilder, PyFullTextQuery}; use env_logger::{Builder, Env}; use file::{ stable_version, LanceBufferDescriptor, LanceColumnMetadata, LanceFileMetadata, LanceFileReader, @@ -240,6 +240,7 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/python/src/transaction.rs b/python/src/transaction.rs index b6a8ea4536c..d4f2a072b0a 100644 --- a/python/src/transaction.rs +++ b/python/src/transaction.rs @@ -1,6 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +use crate::dataset::DatasetBasePath; use crate::schema::LanceSchema; use crate::utils::{class_name, export_vec, extract_vec, PyLance}; use arrow::pyarrow::PyArrowType; @@ -10,7 +11,7 @@ use lance::dataset::transaction::{ UpdateMapEntry, UpdateMode, }; use lance::datatypes::Schema; -use lance_table::format::{DataFile, Fragment, IndexMetadata}; +use lance_table::format::{BasePath, DataFile, Fragment, IndexMetadata}; use pyo3::exceptions::PyValueError; use pyo3::types::PySet; use pyo3::{intern, prelude::*}; @@ -166,10 +167,25 @@ impl FromPyObject<'_> for PyLance { let fragments = extract_vec(&ob.getattr("fragments")?)?; + let initial_bases = ob + .getattr("initial_bases") + .ok() + .and_then(|attr| { + attr.extract::>>().ok() + }) + .flatten() + .map(|py_bases| { + py_bases + .into_iter() + .map(BasePath::from) + .collect::>() + }); + let op = Operation::Overwrite { schema, fragments, config_upsert_values: None, + initial_bases, }; Ok(Self(op)) } diff --git a/rust/lance-io/src/object_store.rs b/rust/lance-io/src/object_store.rs index a07ff3f3b23..8cd1562fe3f 100644 --- a/rust/lance-io/src/object_store.rs +++ b/rust/lance-io/src/object_store.rs @@ -359,6 +359,27 @@ impl ObjectStore { Ok((store, path)) } + /// Extract the path component from a URI without initializing the object store. + /// + /// This is a synchronous operation that only parses the URI and extracts the path, + /// without creating or initializing any object store instance. + /// + /// # Arguments + /// + /// * `registry` - The object store registry to get the provider + /// * `uri` - The URI to extract the path from + /// + /// # Returns + /// + /// The extracted path component + pub fn extract_path_from_uri(registry: Arc, uri: &str) -> Result { + let url = uri_to_url(uri)?; + let provider = registry.get_provider(url.scheme()).ok_or_else(|| { + Error::invalid_input(format!("Unknown scheme: {}", url.scheme()), location!()) + })?; + provider.extract_path(&url) + } + #[deprecated(note = "Use `from_uri` instead")] pub fn from_path(str_path: &str) -> Result<(Arc, Path)> { Self::from_uri_and_params( diff --git a/rust/lance-table/src/feature_flags.rs b/rust/lance-table/src/feature_flags.rs index a74a50a3255..f06e50799a2 100644 --- a/rust/lance-table/src/feature_flags.rs +++ b/rust/lance-table/src/feature_flags.rs @@ -18,8 +18,8 @@ pub const FLAG_STABLE_ROW_IDS: u64 = 2; pub const FLAG_USE_V2_FORMAT_DEPRECATED: u64 = 4; /// Table config is present pub const FLAG_TABLE_CONFIG: u64 = 8; -/// Dataset is a shallow clone with external base paths -pub const FLAG_SHALLOW_CLONE: u64 = 16; +/// Dataset uses multiple base paths (for shallow clones or multi-base datasets) +pub const FLAG_BASE_PATHS: u64 = 16; /// The first bit that is unknown as a feature flag pub const FLAG_UNKNOWN: u64 = 32; @@ -64,10 +64,10 @@ pub fn apply_feature_flags(manifest: &mut Manifest, enable_stable_row_id: bool) manifest.writer_feature_flags |= FLAG_TABLE_CONFIG; } - // Check if this is a shallow clone dataset by examining base_paths + // Check if this dataset uses multiple base paths (for shallow clones or multi-base datasets) if !manifest.base_paths.is_empty() { - manifest.reader_feature_flags |= FLAG_SHALLOW_CLONE; - manifest.writer_feature_flags |= FLAG_SHALLOW_CLONE; + manifest.reader_feature_flags |= FLAG_BASE_PATHS; + manifest.writer_feature_flags |= FLAG_BASE_PATHS; } Ok(()) @@ -97,7 +97,7 @@ mod tests { assert!(can_read_dataset(super::FLAG_STABLE_ROW_IDS)); assert!(can_read_dataset(super::FLAG_USE_V2_FORMAT_DEPRECATED)); assert!(can_read_dataset(super::FLAG_TABLE_CONFIG)); - assert!(can_read_dataset(super::FLAG_SHALLOW_CLONE)); + assert!(can_read_dataset(super::FLAG_BASE_PATHS)); assert!(can_read_dataset( super::FLAG_DELETION_FILES | super::FLAG_STABLE_ROW_IDS @@ -113,19 +113,19 @@ mod tests { assert!(can_write_dataset(super::FLAG_STABLE_ROW_IDS)); assert!(can_write_dataset(super::FLAG_USE_V2_FORMAT_DEPRECATED)); assert!(can_write_dataset(super::FLAG_TABLE_CONFIG)); - assert!(can_write_dataset(super::FLAG_SHALLOW_CLONE)); + assert!(can_write_dataset(super::FLAG_BASE_PATHS)); assert!(can_write_dataset( super::FLAG_DELETION_FILES | super::FLAG_STABLE_ROW_IDS | super::FLAG_USE_V2_FORMAT_DEPRECATED | super::FLAG_TABLE_CONFIG - | super::FLAG_SHALLOW_CLONE + | super::FLAG_BASE_PATHS )); assert!(!can_write_dataset(super::FLAG_UNKNOWN)); } #[test] - fn test_shallow_clone_feature_flags() { + fn test_base_paths_feature_flags() { use crate::format::{DataStorageFormat, Manifest}; use arrow_schema::{Field as ArrowField, Schema as ArrowSchema}; use lance_core::datatypes::Schema; @@ -138,7 +138,7 @@ mod tests { false, )]); let schema = Schema::try_from(&arrow_schema).unwrap(); - // Test 1: Normal dataset (no base_paths) should not have FLAG_SHALLOW_CLONE + // Test 1: Normal dataset (no base_paths) should not have FLAG_BASE_PATHS let mut normal_manifest = Manifest::new( schema.clone(), Arc::new(vec![]), @@ -147,28 +147,34 @@ mod tests { HashMap::new(), // Empty base_paths ); apply_feature_flags(&mut normal_manifest, false).unwrap(); - assert_eq!(normal_manifest.reader_feature_flags & FLAG_SHALLOW_CLONE, 0); - assert_eq!(normal_manifest.writer_feature_flags & FLAG_SHALLOW_CLONE, 0); - // Test 2: Cloned dataset (with base_paths) should have FLAG_SHALLOW_CLONE + assert_eq!(normal_manifest.reader_feature_flags & FLAG_BASE_PATHS, 0); + assert_eq!(normal_manifest.writer_feature_flags & FLAG_BASE_PATHS, 0); + // Test 2: Dataset with base_paths (shallow clone or multi-base) should have FLAG_BASE_PATHS let mut base_paths: HashMap = HashMap::new(); base_paths.insert( 1, - BasePath { - id: 1, - name: Some("test_ref".to_string()), - is_dataset_root: true, - path: "/path/to/original".to_string(), - }, + BasePath::new( + 1, + "file:///path/to/original".to_string(), + Some("test_ref".to_string()), + true, + ), ); - let mut cloned_manifest = Manifest::new( + let mut multi_base_manifest = Manifest::new( schema, Arc::new(vec![]), DataStorageFormat::default(), None, base_paths, ); - apply_feature_flags(&mut cloned_manifest, false).unwrap(); - assert_ne!(cloned_manifest.reader_feature_flags & FLAG_SHALLOW_CLONE, 0); - assert_ne!(cloned_manifest.writer_feature_flags & FLAG_SHALLOW_CLONE, 0); + apply_feature_flags(&mut multi_base_manifest, false).unwrap(); + assert_ne!( + multi_base_manifest.reader_feature_flags & FLAG_BASE_PATHS, + 0 + ); + assert_ne!( + multi_base_manifest.writer_feature_flags & FLAG_BASE_PATHS, + 0 + ); } } diff --git a/rust/lance-table/src/format/fragment.rs b/rust/lance-table/src/format/fragment.rs index a581bcbf433..4d3ab9cf877 100644 --- a/rust/lance-table/src/format/fragment.rs +++ b/rust/lance-table/src/format/fragment.rs @@ -54,6 +54,7 @@ impl DataFile { file_major_version: u32, file_minor_version: u32, file_size_bytes: Option>, + base_id: Option, ) -> Self { Self { path: path.into(), @@ -62,7 +63,7 @@ impl DataFile { file_major_version, file_minor_version, file_size_bytes: file_size_bytes.into(), - base_id: None, + base_id, } } @@ -83,7 +84,11 @@ impl DataFile { } } - pub fn new_legacy_from_fields(path: impl Into, fields: Vec) -> Self { + pub fn new_legacy_from_fields( + path: impl Into, + fields: Vec, + base_id: Option, + ) -> Self { Self::new( path, fields, @@ -91,6 +96,7 @@ impl DataFile { MAJOR_VERSION as u32, MINOR_VERSION as u32, None, + base_id, ) } @@ -98,6 +104,7 @@ impl DataFile { path: impl Into, schema: &Schema, file_size_bytes: Option>, + base_id: Option, ) -> Self { let mut field_ids = schema.field_ids(); field_ids.sort(); @@ -108,6 +115,7 @@ impl DataFile { MAJOR_VERSION as u32, MINOR_VERSION as u32, file_size_bytes, + base_id, ) } @@ -324,7 +332,7 @@ impl Fragment { ) -> Self { Self { id, - files: vec![DataFile::new_legacy(path, schema, None)], + files: vec![DataFile::new_legacy(path, schema, None, None)], deletion_file: None, physical_rows, row_id_meta: None, @@ -347,6 +355,7 @@ impl Fragment { major, minor, file_size_bytes, + None, ); self.files.push(data_file); self @@ -373,12 +382,14 @@ impl Fragment { major, minor, file_size_bytes, + None, )); } /// Add a new [`DataFile`] to this fragment. pub fn add_file_legacy(&mut self, path: &str, schema: &Schema) { - self.files.push(DataFile::new_legacy(path, schema, None)); + self.files + .push(DataFile::new_legacy(path, schema, None, None)); } // True if this fragment is made up of legacy v1 files, false otherwise @@ -519,6 +530,7 @@ mod tests { vec![DataFile::new_legacy_from_fields( path.to_string(), vec![0, 1, 2, 3], + None, )] ) } diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index a86bb460692..2679a906c74 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -21,7 +21,7 @@ use crate::format::pb; use lance_core::cache::LanceCache; use lance_core::datatypes::{Schema, StorageClass}; use lance_core::{Error, Result}; -use lance_io::object_store::ObjectStore; +use lance_io::object_store::{ObjectStore, ObjectStoreRegistry}; use lance_io::utils::read_struct; use snafu::location; @@ -296,12 +296,7 @@ impl Manifest { blob_dataset_version: self.blob_dataset_version, base_paths: { let mut base_paths = self.base_paths.clone(); - let base_path = BasePath { - id: ref_base_id, - name: ref_name, - is_dataset_root: true, - path: ref_path, - }; + let base_path = BasePath::new(ref_base_id, ref_path, ref_name, true); base_paths.insert(ref_base_id, base_path); base_paths }, @@ -575,14 +570,49 @@ impl Manifest { } } -#[derive(Debug, Clone, PartialEq, DeepSizeOf)] +#[derive(Debug, Clone, PartialEq)] pub struct BasePath { pub id: u32, pub name: Option, pub is_dataset_root: bool, + /// The full URI string (e.g., "s3://bucket/path") pub path: String, } +impl BasePath { + /// Create a new BasePath + /// + /// # Arguments + /// + /// * `id` - Unique identifier for this base path + /// * `path` - Full URI string (e.g., "s3://bucket/path", "/local/path") + /// * `name` - Optional human-readable name for this base + /// * `is_dataset_root` - Whether this is the dataset root or a data-only base + pub fn new(id: u32, path: String, name: Option, is_dataset_root: bool) -> Self { + Self { + id, + name, + is_dataset_root, + path, + } + } + + /// Extract the object store path from this BasePath's URI. + /// + /// This is a synchronous operation that parses the URI without initializing an object store. + pub fn extract_path(&self, registry: Arc) -> Result { + ObjectStore::extract_path_from_uri(registry, &self.path) + } +} + +impl DeepSizeOf for BasePath { + fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize { + self.name.deep_size_of_children(context) + + self.path.deep_size_of_children(context) * 2 + + size_of::() + } +} + #[derive(Debug, Clone, PartialEq, DeepSizeOf)] pub struct WriterVersion { pub library: String, @@ -701,6 +731,12 @@ impl ProtoStruct for Manifest { impl From for BasePath { fn from(p: pb::BasePath) -> Self { + Self::new(p.id, p.path, p.name, p.is_dataset_root) + } +} + +impl From for pb::BasePath { + fn from(p: BasePath) -> Self { Self { id: p.id, name: p.name, @@ -1029,7 +1065,11 @@ mod tests { let fragments = vec![ Fragment { id: 0, - files: vec![DataFile::new_legacy_from_fields("path1", vec![0, 1, 2])], + files: vec![DataFile::new_legacy_from_fields( + "path1", + vec![0, 1, 2], + None, + )], deletion_file: None, row_id_meta: None, physical_rows: None, @@ -1037,8 +1077,8 @@ mod tests { Fragment { id: 1, files: vec![ - DataFile::new_legacy_from_fields("path2", vec![0, 1, 43]), - DataFile::new_legacy_from_fields("path3", vec![2]), + DataFile::new_legacy_from_fields("path2", vec![0, 1, 43], None), + DataFile::new_legacy_from_fields("path3", vec![2], None), ], deletion_file: None, row_id_meta: None, diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index ea5d61aefc7..bc27d53f416 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -117,6 +117,7 @@ pub use write::merge_insert::{ MergeInsertBuilder, MergeInsertJob, MergeStats, UncommittedMergeInsert, WhenMatched, WhenNotMatched, WhenNotMatchedBySource, }; + pub use write::update::{UpdateBuilder, UpdateJob}; #[allow(deprecated)] pub use write::{ @@ -163,6 +164,10 @@ pub struct Dataset { /// File reader options to use when reading data files. pub(crate) file_reader_options: Option, + + /// Object store parameters used when opening this dataset. + /// These are used when creating object stores for additional base paths. + pub(crate) store_params: Option, } impl std::fmt::Debug for Dataset { @@ -552,6 +557,7 @@ impl Dataset { self.session.clone(), self.commit_handler.clone(), self.file_reader_options.clone(), + self.store_params.clone(), ) } @@ -662,6 +668,7 @@ impl Dataset { session: Arc, commit_handler: Arc, file_reader_options: Option, + store_params: Option, ) -> Result { let refs = Refs::new( object_store.clone(), @@ -688,6 +695,7 @@ impl Dataset { metadata_cache, index_cache, file_reader_options, + store_params, }) } @@ -830,6 +838,7 @@ impl Dataset { self.session.clone(), self.commit_handler.clone(), self.file_reader_options.clone(), + self.store_params.clone(), )?; Ok(Some(Arc::new(blobs_dataset))) } else { @@ -1365,8 +1374,7 @@ impl Dataset { location!(), ) })?; - - let path = Path::parse(base_path.path.as_str())?; + let path = base_path.extract_path(self.session.store_registry())?; if base_path.is_dataset_root { Ok(path.child(DATA_DIR)) } else { @@ -1377,6 +1385,25 @@ impl Dataset { } } + /// Get the ObjectStore for a specific path based on base_id + pub(crate) async fn object_store_for_base(&self, base_id: u32) -> Result> { + let base_path = self.manifest.base_paths.get(&base_id).ok_or_else(|| { + Error::invalid_input( + format!("Dataset base path with ID {} not found", base_id), + Default::default(), + ) + })?; + + let (store, _) = ObjectStore::from_uri_and_params( + self.session.store_registry(), + &base_path.path, + &self.store_params.clone().unwrap_or_default(), + ) + .await?; + + Ok(store) + } + pub(crate) fn dataset_dir_for_deletion(&self, deletion_file: &DeletionFile) -> Result { match deletion_file.base_id.as_ref() { Some(base_id) => { @@ -1400,7 +1427,7 @@ impl Dataset { location: location!(), }); } - Ok(Path::parse(base_path.path.as_str())?) + base_path.extract_path(self.session.store_registry()) } None => Ok(self.base.clone()), } @@ -1420,7 +1447,7 @@ impl Dataset { location!(), ) })?; - let path = Path::parse(base_path.path.as_str())?; + let path = base_path.extract_path(self.session.store_registry())?; if base_path.is_dataset_root { Ok(path.child(INDICES_DIR)) } else { @@ -1997,6 +2024,7 @@ pub(crate) fn load_new_transactions(dataset: &Dataset) -> NewTransactionResult<' dataset.session(), dataset.commit_handler.clone(), dataset.file_reader_options.clone(), + dataset.store_params.clone(), )?; let object_store = dataset_version.object_store(); let path = dataset_version @@ -2034,6 +2062,7 @@ pub(crate) fn load_new_transactions(dataset: &Dataset) -> NewTransactionResult<' dataset.session(), dataset.commit_handler.clone(), dataset.file_reader_options.clone(), + dataset.store_params.clone(), ) } else { // If we didn't get the latest manifest, we can still return the dataset @@ -4172,6 +4201,7 @@ mod tests { fragments: vec![], schema, config_upsert_values: None, + initial_bases: None, }; let test_dir = TempStdDir::default(); let test_uri = test_dir.to_str().unwrap(); @@ -4204,6 +4234,7 @@ mod tests { fragments: vec![], schema, config_upsert_values: None, + initial_bases: None, }; let test_uri = TempStrDir::default(); let read_version_0_transaction = Transaction::new(0, operation, None, None); diff --git a/rust/lance/src/dataset/builder.rs b/rust/lance/src/dataset/builder.rs index 0a756e26f95..fb1050962f6 100644 --- a/rust/lance/src/dataset/builder.rs +++ b/rust/lance/src/dataset/builder.rs @@ -316,6 +316,7 @@ impl DatasetBuilder { let manifest = self.manifest.take(); let file_reader_options = self.file_reader_options.clone(); + let store_params = self.options.clone(); let (object_store, base_path, commit_handler) = self.build_object_store().await?; // Two cases that need to check out after loading the manifest: @@ -364,6 +365,7 @@ impl DatasetBuilder { object_store, base_path, commit_handler, + Some(store_params), ) .await?; @@ -398,6 +400,7 @@ impl DatasetBuilder { object_store: Arc, base_path: Path, commit_handler: Arc, + store_params: Option, ) -> Result { let (manifest, location) = if let Some(mut manifest) = manifest { let location = commit_handler @@ -458,6 +461,7 @@ impl DatasetBuilder { session, commit_handler, file_reader_options, + store_params, ) } } diff --git a/rust/lance/src/dataset/fragment.rs b/rust/lance/src/dataset/fragment.rs index 66d93e22071..6ad17fa9e81 100644 --- a/rust/lance/src/dataset/fragment.rs +++ b/rust/lance/src/dataset/fragment.rs @@ -935,21 +935,29 @@ impl FileFragment { .dataset .data_file_dir(data_file)? .child(data_file.path.as_str()); - let (store_scheduler, reader_priority) = - if let Some(scan_scheduler) = read_config.scan_scheduler.as_ref() { - ( - scan_scheduler.clone(), - read_config.reader_priority.unwrap_or(0), - ) - } else { - ( - ScanScheduler::new( - self.dataset.object_store.clone(), - SchedulerConfig::max_bandwidth(&self.dataset.object_store), - ), - 0, - ) - }; + let (store_scheduler, reader_priority) = if let Some(base_id) = data_file.base_id { + // TODO: make object stores for non-default bases reuse the same scan scheduler + // currently we always create a new one + let object_store = self.dataset.object_store_for_base(base_id).await?; + let config = SchedulerConfig::max_bandwidth(&object_store); + ( + ScanScheduler::new(object_store, config), + read_config.reader_priority.unwrap_or(0), + ) + } else if let Some(scan_scheduler) = read_config.scan_scheduler.as_ref() { + ( + scan_scheduler.clone(), + read_config.reader_priority.unwrap_or(0), + ) + } else { + ( + ScanScheduler::new( + self.dataset.object_store.clone(), + SchedulerConfig::max_bandwidth(&self.dataset.object_store), + ), + 0, + ) + }; let file_scheduler = store_scheduler .open_file_with_priority(&path, reader_priority as u64, &data_file.file_size_bytes) .await?; @@ -3168,6 +3176,7 @@ mod tests { schema: schema.clone(), fragments, config_upsert_values: None, + initial_bases: None, }; let new_dataset = @@ -3277,6 +3286,7 @@ mod tests { fragments: vec![new_fragment], schema: full_schema.clone(), config_upsert_values: None, + initial_bases: None, }; let dataset = diff --git a/rust/lance/src/dataset/fragment/write.rs b/rust/lance/src/dataset/fragment/write.rs index 3186dedc7b3..5cb2eda363b 100644 --- a/rust/lance/src/dataset/fragment/write.rs +++ b/rust/lance/src/dataset/fragment/write.rs @@ -210,6 +210,7 @@ impl<'a> FragmentCreateBuilder<'a> { stream, params.into_owned(), LanceFileVersion::Stable, + None, // Fragment creation doesn't use target_bases ) .await } diff --git a/rust/lance/src/dataset/optimize.rs b/rust/lance/src/dataset/optimize.rs index 8aa359bfeb1..b780ff80552 100644 --- a/rust/lance/src/dataset/optimize.rs +++ b/rust/lance/src/dataset/optimize.rs @@ -734,6 +734,7 @@ async fn rewrite_files( dataset.schema().clone(), reader, params, + None, // Compaction doesn't use target_bases ) .await?; diff --git a/rust/lance/src/dataset/transaction.rs b/rust/lance/src/dataset/transaction.rs index 2bb689908cf..20738ab88c0 100644 --- a/rust/lance/src/dataset/transaction.rs +++ b/rust/lance/src/dataset/transaction.rs @@ -58,7 +58,9 @@ use lance_io::object_store::ObjectStore; use lance_table::feature_flags::{apply_feature_flags, FLAG_STABLE_ROW_IDS}; use lance_table::rowids::read_row_ids; use lance_table::{ - format::{pb, DataFile, DataStorageFormat, Fragment, IndexMetadata, Manifest, RowIdMeta}, + format::{ + pb, BasePath, DataFile, DataStorageFormat, Fragment, IndexMetadata, Manifest, RowIdMeta, + }, io::{ commit::CommitHandler, manifest::{read_manifest, read_manifest_indexes}, @@ -171,6 +173,7 @@ pub enum Operation { fragments: Vec, schema: Schema, config_upsert_values: Option>, + initial_bases: Option>, }, /// A new index has been created. CreateIndex { @@ -378,16 +381,19 @@ impl PartialEq for Operation { fragments: a_fragments, schema: a_schema, config_upsert_values: a_config, + initial_bases: a_initial, }, Self::Overwrite { fragments: b_fragments, schema: b_schema, config_upsert_values: b_config, + initial_bases: b_initial, }, ) => { compare_vec(a_fragments, b_fragments) && a_schema == b_schema && a_config == b_config + && a_initial == b_initial } ( Self::CreateIndex { @@ -1479,11 +1485,42 @@ impl Transaction { location: location!(), }); } - let reference_paths = match current_manifest { + let mut reference_paths = match current_manifest { Some(m) => m.base_paths.clone(), None => HashMap::new(), }; + if let Operation::Overwrite { + initial_bases: Some(initial_bases), + .. + } = &self.operation + { + if current_manifest.is_none() { + // CREATE mode: registering base paths + // Base IDs should have been assigned during write operation + // Validate uniqueness and insert them into the manifest + for base_path in initial_bases.iter() { + if reference_paths.contains_key(&base_path.id) { + return Err(Error::invalid_input( + format!( + "Duplicate base path ID {} detected. Base path IDs must be unique.", + base_path.id + ), + location!(), + )); + } + reference_paths.insert(base_path.id, base_path.clone()); + } + } else { + // OVERWRITE mode with initial_bases should have been rejected by validation + // This branch should never be reached + return Err(Error::invalid_input( + "OVERWRITE mode cannot register new bases. This should have been caught by validation.", + location!(), + )); + } + } + // Get the schema and the final fragment list let schema = match self.operation { Operation::Overwrite { ref schema, .. } => schema.clone(), @@ -1874,6 +1911,8 @@ impl Transaction { }; let mut manifest = if let Some(current_manifest) = current_manifest { + // OVERWRITE with initial_bases on existing dataset is not allowed (caught by validation) + // So we always use new_from_previous which preserves base_paths let mut prev_manifest = Manifest::new_from_previous( current_manifest, schema, @@ -2448,6 +2487,7 @@ impl TryFrom for Transaction { schema, schema_metadata: _schema_metadata, // TODO: handle metadata config_upsert_values, + initial_bases, })) => { let config_upsert_option = if config_upsert_values.is_empty() { Some(config_upsert_values) @@ -2462,6 +2502,11 @@ impl TryFrom for Transaction { .collect::>>()?, schema: Schema::from(&Fields(schema)), config_upsert_values: config_upsert_option, + initial_bases: if initial_bases.is_empty() { + None + } else { + Some(initial_bases.into_iter().map(BasePath::from).collect()) + }, } } Some(pb::transaction::Operation::ReserveFragments( @@ -2692,6 +2737,7 @@ impl TryFrom for Transaction { schema, schema_metadata: _schema_metadata, // TODO: handle metadata config_upsert_values, + initial_bases, }) => { let config_upsert_option = if config_upsert_values.is_empty() { Some(config_upsert_values) @@ -2706,6 +2752,11 @@ impl TryFrom for Transaction { .collect::>>()?, schema: Schema::from(&Fields(schema)), config_upsert_values: config_upsert_option, + initial_bases: if initial_bases.is_empty() { + None + } else { + Some(initial_bases.into_iter().map(BasePath::from).collect()) + }, }) } }) @@ -2825,6 +2876,7 @@ impl From<&Transaction> for pb::Transaction { fragments, schema, config_upsert_values, + initial_bases, } => { pb::transaction::Operation::Overwrite(pb::transaction::Overwrite { fragments: fragments.iter().map(pb::DataFragment::from).collect(), @@ -2833,6 +2885,16 @@ impl From<&Transaction> for pb::Transaction { config_upsert_values: config_upsert_values .clone() .unwrap_or(Default::default()), + initial_bases: initial_bases + .as_ref() + .map(|paths| { + paths + .iter() + .cloned() + .map(|bp: BasePath| -> pb::BasePath { bp.into() }) + .collect::>() + }) + .unwrap_or_default(), }) } Operation::ReserveFragments { num_fragments } => { @@ -2973,6 +3035,7 @@ impl From<&Transaction> for pb::Transaction { fragments, schema, config_upsert_values, + initial_bases, } => { pb::transaction::BlobOperation::BlobOverwrite(pb::transaction::Overwrite { fragments: fragments.iter().map(pb::DataFragment::from).collect(), @@ -2981,6 +3044,16 @@ impl From<&Transaction> for pb::Transaction { config_upsert_values: config_upsert_values .clone() .unwrap_or(Default::default()), + initial_bases: initial_bases + .as_ref() + .map(|paths| { + paths + .iter() + .cloned() + .map(|bp: BasePath| -> pb::BasePath { bp.into() }) + .collect::>() + }) + .unwrap_or_default(), }) } _ => panic!("Invalid blob operation: {:?}", value), @@ -3073,6 +3146,7 @@ pub fn validate_operation(manifest: Option<&Manifest>, operation: &Operation) -> fragments, schema, config_upsert_values: None, + initial_bases: _, } => schema_fragments_valid(Some(manifest), schema, fragments), Operation::Update { updated_fragments, diff --git a/rust/lance/src/dataset/write.rs b/rust/lance/src/dataset/write.rs index 0c351d60ac1..dc8c4323c6f 100644 --- a/rust/lance/src/dataset/write.rs +++ b/rust/lance/src/dataset/write.rs @@ -4,7 +4,7 @@ use std::collections::HashMap; use std::num::NonZero; use std::sync::Arc; - +use std::sync::atomic::AtomicUsize; use arrow_array::RecordBatch; use chrono::TimeDelta; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; @@ -25,7 +25,7 @@ use lance_file::v2::writer::FileWriterOptions; use lance_file::version::LanceFileVersion; use lance_file::writer::{FileWriter, ManifestProvider}; use lance_io::object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry}; -use lance_table::format::{DataFile, Fragment}; +use lance_table::format::{BasePath, DataFile, Fragment, Manifest}; use lance_table::io::commit::{commit_handler_from_url, CommitHandler}; use lance_table::io::manifest::ManifestDescribing; use object_store::path::Path; @@ -226,6 +226,25 @@ pub struct WriteParams { /// This can include commit messages, engine information, etc. /// this properties map will be persisted as part of Transaction object. pub transaction_properties: Option>>, + + /// New base paths to register in the manifest during dataset creation. + /// Each BasePath must have a properly assigned ID (non-zero). + /// Only used in CREATE/OVERWRITE modes for manifest registration. + /// IDs should be assigned by the caller before passing to WriteParams. + pub initial_bases: Option>, + + /// Target base IDs for writing data files. + /// When provided, all new data files will be written to bases with these IDs. + /// Used in all modes (CREATE, APPEND, OVERWRITE) to specify where data should be written. + /// The IDs must correspond to either: + /// - IDs in initial_bases (for CREATE/OVERWRITE modes) + /// - IDs already registered in the existing dataset manifest (for APPEND mode) + pub target_bases: Option>, + + /// Target base names or paths as strings (unresolved). + /// These will be resolved to IDs when the write operation executes. + /// Resolution happens at builder execution time when dataset context is available. + pub target_base_names_or_paths: Option>, } impl Default for WriteParams { @@ -247,6 +266,9 @@ impl Default for WriteParams { auto_cleanup: Some(AutoCleanupParams::default()), skip_auto_cleanup: false, transaction_properties: None, + initial_bases: None, + target_bases: None, + target_base_names_or_paths: None, } } } @@ -279,6 +301,52 @@ impl WriteParams { ..self } } + + /// Set the initial_bases for this WriteParams. + /// + /// This specifies new base paths to register in the manifest during dataset creation. + /// Each BasePath must have a properly assigned ID (non-zero) before calling this method. + /// Only used in CREATE/OVERWRITE modes for manifest registration. + pub fn with_initial_bases(self, bases: Vec) -> Self { + Self { + initial_bases: Some(bases), + ..self + } + } + + /// Set the target_bases for this WriteParams. + /// + /// This specifies the base IDs where data files should be written. + /// The IDs must correspond to either: + /// - IDs in initial_bases (for CREATE/OVERWRITE modes) + /// - IDs already registered in the existing dataset manifest (for APPEND mode) + pub fn with_target_bases(self, base_ids: Vec) -> Self { + Self { + target_bases: Some(base_ids), + ..self + } + } + + /// Store target base names or paths for deferred resolution. + /// + /// This method stores the references in `target_base_names_or_paths` field + /// to be resolved later at execution time when the dataset manifest is available. + /// + /// Resolution will happen at write execution time and will try to match: + /// 1. initial_bases by name + /// 2. initial_bases by path + /// 3. existing manifest by name + /// 4. existing manifest by path + /// + /// # Arguments + /// + /// * `references` - Vector of base names or paths to be resolved later + pub fn with_target_base_names_or_paths(self, references: Vec) -> Self { + Self { + target_base_names_or_paths: Some(references), + ..self + } + } } /// Writes the given data to the dataset and returns fragments. @@ -308,6 +376,7 @@ pub async fn do_write_fragments( data: SendableRecordBatchStream, params: WriteParams, storage_version: LanceFileVersion, + target_bases_info: Option>, ) -> Result> { // Convert arrow.json to lance.json (JSONB) for storage if needed // @@ -325,7 +394,13 @@ pub async fn do_write_fragments( .boxed() }; - let writer_generator = WriterGenerator::new(object_store, base_dir, schema, storage_version); + let writer_generator = WriterGenerator::new( + object_store, + base_dir, + schema, + storage_version, + target_bases_info + ); let mut writer: Option> = None; let mut num_rows_in_current_file = 0; let mut fragments = Vec::new(); @@ -377,6 +452,208 @@ pub struct WrittenFragments { pub blob: Option<(Vec, Schema)>, } +/// Resolve target bases for writing. +/// +/// This function should be called early in the write flow (e.g., in InsertBuilder) +/// before calling write_fragments_internal. +/// +/// This function: +/// 1. Assigns sequential IDs to base paths that have ID 0 in initial_bases +/// 2. Validates that all IDs in initial_bases are non-zero and unique +/// 3. Resolves target_base_names_or_paths to target_bases IDs +/// 4. Validates that all target base IDs are non-zero +/// 5. Prepares TargetBaseInfo structs with object stores for round-robin selection +/// +/// Returns Option> - Some if target bases are configured, None otherwise +pub(crate) async fn resolve_target_bases( + params: &mut WriteParams, + existing_manifest: Option<&Manifest>, + store_registry: Arc, +) -> Result>> { + // Step 1: Assign IDs to initial_bases that have ID 0 + if let Some(initial_bases) = &mut params.initial_bases { + let mut next_id = 1u32; + for base_path in initial_bases.iter_mut() { + if base_path.id == 0 { + base_path.id = next_id; + next_id += 1; + } else { + // Track the highest assigned ID to avoid conflicts + next_id = next_id.max(base_path.id + 1); + } + } + } + + // Step 2: Validate that IDs in initial_bases are non-zero and unique + if let Some(initial_bases) = ¶ms.initial_bases { + let mut seen_ids = std::collections::HashSet::new(); + for base_path in initial_bases { + if base_path.id == 0 { + return Err(Error::invalid_input( + format!( + "Base path '{}' has ID 0. IDs must be non-zero and assigned.", + base_path.name.as_ref().unwrap_or(&base_path.path) + ), + location!(), + )); + } + if !seen_ids.insert(base_path.id) { + return Err(Error::invalid_input( + format!("Duplicate base path ID: {}", base_path.id), + location!(), + )); + } + } + } + + // Step 3: Resolve target_base_names_or_paths to IDs if present + let resolved_target_bases = if let Some(ref names_or_paths) = params.target_base_names_or_paths + { + let mut resolved_ids = Vec::new(); + for reference in names_or_paths { + let ref_str = reference.as_str(); + let mut id_found = None; + + // Check initial_bases by name + if let Some(initial_bases) = ¶ms.initial_bases { + for base in initial_bases { + if let Some(name) = &base.name { + if name == ref_str { + id_found = Some(base.id); + break; + } + } + } + } + + // Check initial_bases by path + if id_found.is_none() { + if let Some(initial_bases) = ¶ms.initial_bases { + for base in initial_bases { + if base.path == ref_str { + id_found = Some(base.id); + break; + } + } + } + } + + // Check existing manifest by name + if id_found.is_none() { + if let Some(manifest) = existing_manifest { + for (id, base) in &manifest.base_paths { + if let Some(name) = &base.name { + if name == ref_str { + id_found = Some(*id); + break; + } + } + } + } + } + + // Check existing manifest by path + if id_found.is_none() { + if let Some(manifest) = existing_manifest { + for (id, base) in &manifest.base_paths { + if base.path == ref_str { + id_found = Some(*id); + break; + } + } + } + } + + let id = id_found.ok_or_else(|| { + Error::invalid_input( + format!( + "Base reference '{}' not found in initial_bases or existing manifest", + ref_str + ), + location!(), + ) + })?; + + resolved_ids.push(id); + } + Some(resolved_ids) + } else { + params.target_bases.clone() + }; + + // Step 4: prepare TargetBaseInfo structs + if let Some(target_bases) = &resolved_target_bases { + let store_params = params.store_params.clone().unwrap_or_default(); + + // Collect information for all target bases + let mut base_infos = Vec::new(); + for &target_base_id in target_bases { + // Find the base_path for this ID + let (target_path_uri, is_dataset_root) = match ¶ms.initial_bases { + Some(initial_bases) => { + // CREATE or OVERWRITE with new bases: find the base_path by ID + let base_path = initial_bases + .iter() + .find(|base_path| base_path.id == target_base_id) + .ok_or_else(|| { + Error::invalid_input( + format!( + "Target base ID {} not found in initial_bases", + target_base_id + ), + location!(), + ) + })?; + (base_path.path.clone(), base_path.is_dataset_root) + } + None => { + // APPEND or OVERWRITE without new bases: find in existing manifest + let manifest = existing_manifest.ok_or_else(|| { + Error::invalid_input( + format!( + "Cannot use target base ID {} without existing manifest or initial_bases", + target_base_id + ), + location!(), + ) + })?; + + let base_path = manifest.base_paths.get(&target_base_id).ok_or_else(|| { + Error::invalid_input( + format!( + "Target base ID {} not found in existing manifest", + target_base_id + ), + location!(), + ) + })?; + (base_path.path.clone(), base_path.is_dataset_root) + } + }; + + // Extract path from URI using standard ObjectStore method + let (target_object_store, extracted_path) = ObjectStore::from_uri_and_params( + store_registry.clone(), + &target_path_uri, + &store_params, + ) + .await?; + + // Store the raw path - /data will be added later based on is_dataset_root + base_infos.push(TargetBaseInfo { + base_id: target_base_id, + object_store: target_object_store, + base_dir: extracted_path, + is_dataset_root, + }); + } + + Ok(Some(base_infos)) + } else { + Ok(None) + } +} + /// Writes the given data to the dataset and returns fragments. /// /// NOTE: the fragments have not yet been assigned an ID. That must be done @@ -394,6 +671,7 @@ pub async fn write_fragments_internal( schema: Schema, data: SendableRecordBatchStream, mut params: WriteParams, + target_bases_info: Option>, ) -> Result { // Convert Arrow JSON columns to Lance JSON (JSONB) format // @@ -502,6 +780,7 @@ pub async fn write_fragments_internal( data, params, storage_version, + target_bases_info, ); let (default, blob) = if let Some(blob_data) = blob_data { @@ -514,6 +793,7 @@ pub async fn write_fragments_internal( blob_data, blob_write_params, storage_version, + None, // Blobs don't use target_bases ); let (fragments_res, blobs_res) = futures::join!(fragments_fut, blob_fut); let fragments = fragments_res?; @@ -540,22 +820,35 @@ pub trait GenericWriter: Send { async fn finish(&mut self) -> Result<(u32, DataFile)>; } +struct V1WriterAdapter +where + M: ManifestProvider + Send + Sync, +{ + writer: FileWriter, + path: String, + base_id: Option, +} + #[async_trait::async_trait] -impl GenericWriter for (FileWriter, String) { +impl GenericWriter for V1WriterAdapter +where + M: ManifestProvider + Send + Sync, +{ async fn write(&mut self, batches: &[RecordBatch]) -> Result<()> { - self.0.write(batches).await + self.writer.write(batches).await } async fn tell(&mut self) -> Result { - Ok(self.0.tell().await? as u64) + Ok(self.writer.tell().await? as u64) } async fn finish(&mut self) -> Result<(u32, DataFile)> { - let size_bytes = self.0.tell().await?; + let size_bytes = self.writer.tell().await?; Ok(( - self.0.finish().await? as u32, + self.writer.finish().await? as u32, DataFile::new_legacy( - self.1.clone(), - self.0.schema(), + self.path.clone(), + self.writer.schema(), NonZero::new(size_bytes as u64), + self.base_id, ), )) } @@ -564,6 +857,7 @@ impl GenericWriter for (FileWriter, String struct V2WriterAdapter { writer: v2::writer::FileWriter, path: String, + base_id: Option, } #[async_trait::async_trait] @@ -599,6 +893,7 @@ impl GenericWriter for V2WriterAdapter { major, minor, NonZero::new(self.writer.tell().await?), + self.base_id, ); Ok((num_rows, data_file)) } @@ -609,22 +904,46 @@ pub async fn open_writer( schema: &Schema, base_dir: &Path, storage_version: LanceFileVersion, +) -> Result> { + open_writer_with_options( + object_store, + schema, + base_dir, + storage_version, + true, + None, + ) + .await +} + +pub async fn open_writer_with_options( + object_store: &ObjectStore, + schema: &Schema, + base_dir: &Path, + storage_version: LanceFileVersion, + add_data_dir: bool, + base_id: Option, ) -> Result> { let filename = format!("{}.lance", generate_random_filename()); - let full_path = base_dir.child(DATA_DIR).child(filename.as_str()); + let full_path = if add_data_dir { + base_dir.child(DATA_DIR).child(filename.as_str()) + } else { + base_dir.child(filename.as_str()) + }; let writer = if storage_version == LanceFileVersion::Legacy { - Box::new(( - FileWriter::::try_new( + Box::new(V1WriterAdapter { + writer: FileWriter::::try_new( object_store, &full_path, schema.clone(), &Default::default(), ) .await?, - filename, - )) + path: filename, + base_id, + }) } else { let writer = object_store.create(&full_path).await?; let file_writer = v2::writer::FileWriter::try_new( @@ -638,18 +957,37 @@ pub async fn open_writer( let writer_adapter = V2WriterAdapter { writer: file_writer, path: filename, + base_id, }; Box::new(writer_adapter) as Box }; Ok(writer) } -/// Creates new file writers for a given dataset. +/// Information about a target base for writing. +/// Contains the base ID, object store, directory path, and whether it's a dataset root. +pub struct TargetBaseInfo { + pub base_id: u32, + pub object_store: Arc, + /// The base directory path (without /data subdirectory) + pub base_dir: Path, + /// Whether this base path is a dataset root. + /// If true, /data will be added when creating file paths. + /// If false, files will be written directly to base_dir. + pub is_dataset_root: bool, +} + struct WriterGenerator { + /// Default object store (used when no target bases specified) object_store: Arc, + /// Default base directory (used when no target bases specified) base_dir: Path, schema: Schema, storage_version: LanceFileVersion, + /// Target base information (if writing to specific bases) + target_bases_info: Option>, + /// Counter for round-robin selection + next_base_index: AtomicUsize, } impl WriterGenerator { @@ -658,26 +996,52 @@ impl WriterGenerator { base_dir: &Path, schema: &Schema, storage_version: LanceFileVersion, + target_bases_info: Option>, ) -> Self { Self { object_store, base_dir: base_dir.clone(), schema: schema.clone(), storage_version, + target_bases_info, + next_base_index: AtomicUsize::new(0), } } + /// Select the next target base using round-robin strategy. + /// TODO: In the future, we can develop different strategies for selecting target bases + fn select_target_base(&self) -> Option<&TargetBaseInfo> { + self.target_bases_info.as_ref().map(|bases| { + let index = self + .next_base_index + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + &bases[index % bases.len()] + }) + } + pub async fn new_writer(&self) -> Result<(Box, Fragment)> { // Use temporary ID 0; will assign ID later. let fragment = Fragment::new(0); - let writer = open_writer( - &self.object_store, - &self.schema, - &self.base_dir, - self.storage_version, - ) - .await?; + let writer = if let Some(base_info) = self.select_target_base() { + open_writer_with_options( + base_info.object_store.as_ref(), + &self.schema, + &base_info.base_dir, + self.storage_version, + base_info.is_dataset_root, + Some(base_info.base_id), + ) + .await? + } else { + open_writer( + self.object_store.as_ref(), + &self.schema, + &self.base_dir, + self.storage_version, + ) + .await? + }; Ok((writer, fragment)) } @@ -697,7 +1061,7 @@ async fn resolve_commit_handler( .map(|opts| opts.object_store.is_some()) .unwrap_or_default() { - return Err(Error::InvalidInput { source: "when creating a dataset with a custom object store the commit_handler must also be specified".into(), location: location!() }); + return Err(Error::InvalidInput { source: "when creating a dataset with a custom object store the commit_handler must also be specified".into(), location: Default::default() }); } commit_handler_from_url(uri, store_options).await } @@ -706,7 +1070,7 @@ async fn resolve_commit_handler( Err(Error::InvalidInput { source: "`s3+ddb://` scheme and custom commit handler are mutually exclusive" .into(), - location: location!(), + location: Default::default(), }) } else { Ok(commit_handler) @@ -944,6 +1308,7 @@ mod tests { schema, data_stream, write_params, + None, ) .await } @@ -1015,6 +1380,7 @@ mod tests { schema, data_stream, write_params, + None, ) .await .unwrap(); @@ -1092,6 +1458,7 @@ mod tests { schema.clone(), data_stream, write_params, + None, ) .await .unwrap(); @@ -1124,4 +1491,585 @@ mod tests { let batch = reader.read_batch(0, .., &schema).await.unwrap(); assert_eq!(batch, data); } + + #[tokio::test] + async fn test_explicit_data_file_bases_writer_generator() { + use arrow::datatypes::{DataType, Field as ArrowField, Schema as ArrowSchema}; + use lance_io::object_store::ObjectStore; + use std::sync::Arc; + + // Create test schema + let arrow_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let schema = Schema::try_from(arrow_schema.as_ref()).unwrap(); + + // Create in-memory object store + let object_store = Arc::new(ObjectStore::memory()); + let base_dir = Path::from("test/bucket2"); + + // Test WriterGenerator with explicit data file bases configuration + let target_bases = vec![TargetBaseInfo { + base_id: 2, + object_store: object_store.clone(), + base_dir: base_dir.clone(), + is_dataset_root: false, // Test uses direct data directory + }]; + let writer_generator = WriterGenerator::new( + object_store.clone(), + &base_dir, + &schema, + LanceFileVersion::Stable, + Some(target_bases), + ); + + // Create a writer + let (writer, fragment) = writer_generator.new_writer().await.unwrap(); + + // Verify fragment is created + assert_eq!(fragment.id, 0); // Temporary ID + + // Verify writer is created (we can't test much more without writing data) + drop(writer); // Clean up + } + + #[tokio::test] + async fn test_writer_with_base_id() { + use arrow::array::Int32Array; + use arrow::datatypes::{DataType, Field as ArrowField, Schema as ArrowSchema}; + use arrow::record_batch::RecordBatch; + use lance_io::object_store::ObjectStore; + use std::sync::Arc; + + // Create test data + let arrow_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let schema = Schema::try_from(arrow_schema.as_ref()).unwrap(); + + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + // Create in-memory object store and writer + let object_store = Arc::new(ObjectStore::memory()); + let base_dir = Path::from("test/bucket2"); + + let mut inner_writer = open_writer_with_options( + object_store.as_ref(), + &schema, + &base_dir, + LanceFileVersion::Stable, + false, // Don't add /data + None, + ) + .await + .unwrap(); + + // Write data + inner_writer.write(&[batch]).await.unwrap(); + + // Finish and manually set base_id + let base_id = 2u32; + let (_num_rows, mut data_file) = inner_writer.finish().await.unwrap(); + data_file.base_id = Some(base_id); + + assert_eq!(data_file.base_id, Some(base_id)); + assert!(!data_file.path.is_empty()); + } + + #[tokio::test] + async fn test_round_robin_target_base_selection() { + use arrow::array::Int32Array; + use arrow::datatypes::{DataType, Field as ArrowField, Schema as ArrowSchema}; + use arrow::record_batch::RecordBatch; + use lance_io::object_store::ObjectStore; + use std::sync::Arc; + + // Create test schema + let arrow_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "id", + DataType::Int32, + false, + )])); + let schema = Schema::try_from(arrow_schema.as_ref()).unwrap(); + + // Create in-memory object stores for different bases + let store1 = Arc::new(ObjectStore::memory()); + let store2 = Arc::new(ObjectStore::memory()); + let store3 = Arc::new(ObjectStore::memory()); + + // Create WriterGenerator with multiple target bases + let target_bases = vec![ + TargetBaseInfo { + base_id: 1, + object_store: store1.clone(), + base_dir: Path::from("base1"), + is_dataset_root: false, + }, + TargetBaseInfo { + base_id: 2, + object_store: store2.clone(), + base_dir: Path::from("base2"), + is_dataset_root: false, + }, + TargetBaseInfo { + base_id: 3, + object_store: store3.clone(), + base_dir: Path::from("base3"), + is_dataset_root: false, + }, + ]; + + let writer_generator = WriterGenerator::new( + Arc::new(ObjectStore::memory()), + &Path::from("default"), + &schema, + LanceFileVersion::Stable, + Some(target_bases), + ); + + // Create test batch + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + // Create multiple writers and verify round-robin selection + let mut base_ids = Vec::new(); + for _ in 0..6 { + let (mut writer, _fragment) = writer_generator.new_writer().await.unwrap(); + writer.write(std::slice::from_ref(&batch)).await.unwrap(); + let (_num_rows, data_file) = writer.finish().await.unwrap(); + base_ids.push(data_file.base_id.unwrap()); + } + + // Verify round-robin pattern: 1, 2, 3, 1, 2, 3 + assert_eq!(base_ids, vec![1, 2, 3, 1, 2, 3]); + } + + #[tokio::test] + async fn test_explicit_data_file_bases_path_parsing() { + // Test URI parsing logic + let test_cases = vec![ + ("s3://multi-path-test/test1/subBucket2", "test1/subBucket2"), + ("gs://my-bucket/path/to/data", "path/to/data"), + ("file:///tmp/test/bucket", "tmp/test/bucket"), + ]; + + for (uri, expected_path) in test_cases { + let url = url::Url::parse(uri).unwrap(); + let parsed_path = url.path().trim_start_matches('/'); + assert_eq!(parsed_path, expected_path, "Failed for URI: {}", uri); + } + } + + #[tokio::test] + async fn test_write_params_validation() { + // Test CREATE mode validation + let mut params = WriteParams { + mode: WriteMode::Create, + initial_bases: Some(vec![ + BasePath { + id: 1, + name: Some("bucket1".to_string()), + path: "s3://bucket1/path1".to_string(), + is_dataset_root: true, + }, + BasePath { + id: 2, + name: Some("bucket2".to_string()), + path: "s3://bucket2/path2".to_string(), + is_dataset_root: true, + }, + ]), + target_bases: Some(vec![1]), // Use ID 1 which corresponds to bucket1 + ..Default::default() + }; + + // This should be valid + let result = validate_write_params(¶ms); + assert!(result.is_ok()); + + // Test target_bases with ID not in initial_bases (should fail) + params.target_bases = Some(vec![99]); // ID 99 doesn't exist + let result = validate_write_params(¶ms); + assert!(result.is_err()); + + // Test CREATE mode with target_bases but no initial_bases (should fail) + params.initial_bases = None; + params.target_bases = Some(vec![1]); + let result = validate_write_params(¶ms); + assert!(result.is_err()); + } + + fn validate_write_params(params: &WriteParams) -> Result<()> { + // Replicate the validation logic from the main write function + if matches!(params.mode, WriteMode::Create) { + if let Some(target_bases) = ¶ms.target_bases { + if target_bases.len() != 1 { + return Err(Error::invalid_input( + format!( + "target_bases with {} elements is not supported", + target_bases.len() + ), + Default::default(), + )); + } + let target_base_id = target_bases[0]; + if let Some(initial_bases) = ¶ms.initial_bases { + if !initial_bases.iter().any(|bp| bp.id == target_base_id) { + return Err(Error::invalid_input( + format!( + "target_base_id {} must be one of the initial_bases in CREATE mode", + target_base_id + ), + Default::default(), + )); + } + } else { + return Err(Error::invalid_input( + "initial_bases must be provided when target_bases is specified in CREATE mode", + Default::default(), + )); + } + } + } + Ok(()) + } + + #[tokio::test] + async fn test_multi_base_create() { + use lance_testing::datagen::{BatchGenerator, IncrementingInt32}; + + // Create dataset with multi-base configuration + let test_uri = "memory://multi_base_test"; + let primary_uri = format!("{}/primary", test_uri); + let base1_uri = format!("{}/base1", test_uri); + let base2_uri = format!("{}/base2", test_uri); + + let mut data_gen = + BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); + + let dataset = crate::dataset::Dataset::write( + data_gen.batch(5), + &primary_uri, + Some(WriteParams { + mode: WriteMode::Create, + initial_bases: Some(vec![ + BasePath { + id: 1, + name: Some("base1".to_string()), + path: base1_uri.clone(), + is_dataset_root: true, + }, + BasePath { + id: 2, + name: Some("base2".to_string()), + path: base2_uri.clone(), + is_dataset_root: true, + }, + ]), + target_bases: Some(vec![1]), + ..Default::default() + }), + ) + .await + .unwrap(); + + // Verify dataset was created + assert_eq!(dataset.count_rows(None).await.unwrap(), 5); + + // Verify base_paths are registered in manifest + assert_eq!(dataset.manifest.base_paths.len(), 2); + assert!(dataset + .manifest + .base_paths + .values() + .any(|bp| bp.name == Some("base1".to_string()))); + assert!(dataset + .manifest + .base_paths + .values() + .any(|bp| bp.name == Some("base2".to_string()))); + + // Verify data was written to base1 + let fragments = dataset.get_fragments(); + assert!(!fragments.is_empty()); + for fragment in fragments { + assert!(fragment + .metadata + .files + .iter() + .any(|file| file.base_id == Some(1))); + } + } + + #[tokio::test] + async fn test_multi_base_overwrite() { + use lance_testing::datagen::{BatchGenerator, IncrementingInt32}; + + // Create initial dataset + let test_uri = "memory://overwrite_test"; + let primary_uri = format!("{}/primary", test_uri); + let base1_uri = format!("{}/base1", test_uri); + let base2_uri = format!("{}/base2", test_uri); + let _base3_uri = format!("{}/base3", test_uri); + + let mut data_gen = + BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); + + let dataset = crate::dataset::Dataset::write( + data_gen.batch(3), + &primary_uri, + Some(WriteParams { + mode: WriteMode::Create, + initial_bases: Some(vec![ + BasePath { + id: 1, + name: Some("base1".to_string()), + path: base1_uri.clone(), + is_dataset_root: true, + }, + BasePath { + id: 2, + name: Some("base2".to_string()), + path: base2_uri.clone(), + is_dataset_root: true, + }, + ]), + target_bases: Some(vec![1]), + ..Default::default() + }), + ) + .await + .unwrap(); + + assert_eq!(dataset.count_rows(None).await.unwrap(), 3); + + // Overwrite - should inherit existing base configuration (base1, base2) + // Write to base2 + let mut data_gen2 = + BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); + + let dataset = crate::dataset::Dataset::write( + data_gen2.batch(2), + std::sync::Arc::new(dataset), + Some(WriteParams { + mode: WriteMode::Overwrite, + // No initial_bases - inherits existing base_paths + target_bases: Some(vec![2]), // Write to base2 + ..Default::default() + }), + ) + .await + .unwrap(); + + // Verify data was overwritten + assert_eq!(dataset.count_rows(None).await.unwrap(), 2); + + // Verify base_paths were inherited (still base1 and base2) + assert_eq!(dataset.manifest.base_paths.len(), 2); + assert!(dataset + .manifest + .base_paths + .values() + .any(|bp| bp.name == Some("base1".to_string()))); + assert!(dataset + .manifest + .base_paths + .values() + .any(|bp| bp.name == Some("base2".to_string()))); + + // Verify data was written to base2 (ID 2) + let fragments = dataset.get_fragments(); + assert!(fragments.iter().any(|f| f + .metadata + .files + .iter() + .any(|file| file.base_id == Some(2)))); + } + + #[tokio::test] + async fn test_multi_base_append() { + use lance_testing::datagen::{BatchGenerator, IncrementingInt32}; + + // Create initial dataset with multi-base configuration + let test_uri = "memory://append_test"; + let primary_uri = format!("{}/primary", test_uri); + let base1_uri = format!("{}/base1", test_uri); + let base2_uri = format!("{}/base2", test_uri); + + let mut data_gen = + BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); + + let dataset = crate::dataset::Dataset::write( + data_gen.batch(3), + &primary_uri, + Some(WriteParams { + mode: WriteMode::Create, + initial_bases: Some(vec![ + BasePath { + id: 1, + name: Some("base1".to_string()), + path: base1_uri.clone(), + is_dataset_root: true, + }, + BasePath { + id: 2, + name: Some("base2".to_string()), + path: base2_uri.clone(), + is_dataset_root: true, + }, + ]), + target_bases: Some(vec![1]), + ..Default::default() + }), + ) + .await + .unwrap(); + + assert_eq!(dataset.count_rows(None).await.unwrap(), 3); + + // Append to base1 (same base as initial write) + let mut data_gen2 = + BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); + + let dataset = crate::dataset::Dataset::write( + data_gen2.batch(2), + std::sync::Arc::new(dataset), + Some(WriteParams { + mode: WriteMode::Append, + target_bases: Some(vec![1]), + ..Default::default() + }), + ) + .await + .unwrap(); + + assert_eq!(dataset.count_rows(None).await.unwrap(), 5); + + // Verify base_paths are still registered + assert_eq!(dataset.manifest.base_paths.len(), 2); + + // Append to base2 (different base) + let mut data_gen3 = + BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); + + let dataset = crate::dataset::Dataset::write( + data_gen3.batch(4), + std::sync::Arc::new(dataset), + Some(WriteParams { + mode: WriteMode::Append, + target_bases: Some(vec![2]), + ..Default::default() + }), + ) + .await + .unwrap(); + + assert_eq!(dataset.count_rows(None).await.unwrap(), 9); + + // Verify data is distributed across both bases + let fragments = dataset.get_fragments(); + let has_base1_data = fragments + .iter() + .any(|f| f.metadata.files.iter().any(|file| file.base_id == Some(1))); + let has_base2_data = fragments + .iter() + .any(|f| f.metadata.files.iter().any(|file| file.base_id == Some(2))); + + assert!(has_base1_data, "Should have data in base1"); + assert!(has_base2_data, "Should have data in base2"); + } + + #[tokio::test] + async fn test_multi_base_overwrite_reference_existing() { + use lance_testing::datagen::{BatchGenerator, IncrementingInt32}; + + // Create initial dataset with multi-base configuration + let test_uri = "memory://overwrite_ref_test"; + let primary_uri = format!("{}/primary", test_uri); + let base1_uri = format!("{}/base1", test_uri); + let base2_uri = format!("{}/base2", test_uri); + + let mut data_gen = + BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); + + let dataset = crate::dataset::Dataset::write( + data_gen.batch(3), + &primary_uri, + Some(WriteParams { + mode: WriteMode::Create, + initial_bases: Some(vec![ + BasePath { + id: 1, + name: Some("base1".to_string()), + path: base1_uri.clone(), + is_dataset_root: true, + }, + BasePath { + id: 2, + name: Some("base2".to_string()), + path: base2_uri.clone(), + is_dataset_root: true, + }, + ]), + target_bases: Some(vec![1]), + ..Default::default() + }), + ) + .await + .unwrap(); + + assert_eq!(dataset.count_rows(None).await.unwrap(), 3); + assert_eq!(dataset.manifest.base_paths.len(), 2); + + // Overwrite referencing existing base2 without registering new bases + let mut data_gen2 = + BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); + + let dataset = crate::dataset::Dataset::write( + data_gen2.batch(5), + std::sync::Arc::new(dataset), + Some(WriteParams { + mode: WriteMode::Overwrite, + target_bases: Some(vec![2]), + ..Default::default() + }), + ) + .await + .unwrap(); + + // Verify data was overwritten with new row count + assert_eq!(dataset.count_rows(None).await.unwrap(), 5); + + // Verify base_paths are still the same (no new bases registered) + assert_eq!(dataset.manifest.base_paths.len(), 2); + assert!(dataset + .manifest + .base_paths + .values() + .any(|bp| bp.name == Some("base1".to_string()))); + assert!(dataset + .manifest + .base_paths + .values() + .any(|bp| bp.name == Some("base2".to_string()))); + + // Verify data was written to base2 + let fragments = dataset.get_fragments(); + assert!(fragments.iter().any(|f| f + .metadata + .files + .iter() + .any(|file| file.base_id == Some(2)))); + } } diff --git a/rust/lance/src/dataset/write/commit.rs b/rust/lance/src/dataset/write/commit.rs index 44a4e8485f4..142d43ec046 100644 --- a/rust/lance/src/dataset/write/commit.rs +++ b/rust/lance/src/dataset/write/commit.rs @@ -391,6 +391,7 @@ impl<'a> CommitBuilder<'a> { branch: manifest.branch.clone(), }, ); + Ok(Dataset { object_store, base: base_path, @@ -404,6 +405,7 @@ impl<'a> CommitBuilder<'a> { fragment_bitmap, metadata_cache, file_reader_options: None, + store_params: self.store_params.clone(), }) } } diff --git a/rust/lance/src/dataset/write/insert.rs b/rust/lance/src/dataset/write/insert.rs index 904580a7d1a..e4b2cdb1838 100644 --- a/rust/lance/src/dataset/write/insert.rs +++ b/rust/lance/src/dataset/write/insert.rs @@ -25,7 +25,7 @@ use snafu::location; use crate::dataset::builder::DatasetBuilder; use crate::dataset::transaction::{Operation, Transaction, TransactionBuilder}; -use crate::dataset::write::write_fragments_internal; +use crate::dataset::write::{write_fragments_internal, TargetBaseInfo}; use crate::dataset::ReadParams; use crate::Dataset; use crate::{Error, Result}; @@ -195,6 +195,8 @@ impl<'a> InsertBuilder<'a> { self.validate_write(&mut context, &schema)?; + let target_base_info = self.validate_and_resolve_multi_base(&mut context).await?; + let written_frags = write_fragments_internal( context.dest.dataset(), context.object_store.clone(), @@ -202,6 +204,7 @@ impl<'a> InsertBuilder<'a> { schema.clone(), stream, context.params.clone(), + target_base_info, ) .await?; @@ -210,6 +213,39 @@ impl<'a> InsertBuilder<'a> { Ok((transaction, context)) } + async fn validate_and_resolve_multi_base( + &self, + context: &mut WriteContext<'_>, + ) -> Result>> { + + if !matches!(context.params.mode, WriteMode::Create) + && context.params.initial_bases.is_some() + { + return Err(Error::invalid_input( + format!( + "Cannot register new bases in {:?} mode. Only CREATE mode can register new bases.", + context.params.mode + ), + location!(), + )); + } + + let existing_manifest = context.dest.dataset().map(|ds| ds.manifest.as_ref()); + let store_registry = context + .params + .session + .as_ref() + .map(|s| s.store_registry()) + .unwrap_or_default(); + + crate::dataset::write::resolve_target_bases( + &mut context.params, + existing_manifest, + store_registry, + ) + .await + } + fn build_transaction( schema: Schema, written_frags: WrittenFragments, @@ -246,19 +282,24 @@ impl<'a> InsertBuilder<'a> { } None => None, }; + Operation::Overwrite { // Use the full schema, not the written schema schema, fragments: written_frags.default.0, config_upsert_values, + initial_bases: context.params.initial_bases.clone(), + } + } + WriteMode::Overwrite => { + Operation::Overwrite { + // Use the full schema, not the written schema + schema, + fragments: written_frags.default.0, + config_upsert_values: None, + initial_bases: context.params.initial_bases.clone(), } } - WriteMode::Overwrite => Operation::Overwrite { - // Use the full schema, not the written schema - schema, - fragments: written_frags.default.0, - config_upsert_values: None, - }, WriteMode::Append => Operation::Append { fragments: written_frags.default.0, }, @@ -269,6 +310,7 @@ impl<'a> InsertBuilder<'a> { schema: blob.1, fragments: blob.0, config_upsert_values: None, + initial_bases: context.params.initial_bases.clone(), }, WriteMode::Append => Operation::Append { fragments: blob.0 }, }); diff --git a/rust/lance/src/dataset/write/merge_insert.rs b/rust/lance/src/dataset/write/merge_insert.rs index dc6f471f6e8..c7904f04a26 100644 --- a/rust/lance/src/dataset/write/merge_insert.rs +++ b/rust/lance/src/dataset/write/merge_insert.rs @@ -1051,6 +1051,7 @@ impl MergeInsertJob { write_schema, stream, Default::default(), // TODO: support write params. + None, // Merge insert doesn't use target_bases ) .await?; @@ -1435,6 +1436,7 @@ impl MergeInsertJob { self.dataset.schema().clone(), Box::pin(stream), WriteParams::default(), + None, // Merge insert doesn't use target_bases ) .await?; diff --git a/rust/lance/src/dataset/write/merge_insert/exec/write.rs b/rust/lance/src/dataset/write/merge_insert/exec/write.rs index abbbc5e9b88..bd6fa18679d 100644 --- a/rust/lance/src/dataset/write/merge_insert/exec/write.rs +++ b/rust/lance/src/dataset/write/merge_insert/exec/write.rs @@ -823,6 +823,7 @@ impl ExecutionPlan for FullSchemaMergeInsertExec { dataset.schema().clone(), write_data_stream, WriteParams::default(), + None, // Merge insert doesn't use target_bases ) .await?; diff --git a/rust/lance/src/dataset/write/update.rs b/rust/lance/src/dataset/write/update.rs index 2b73d69802f..de944561236 100644 --- a/rust/lance/src/dataset/write/update.rs +++ b/rust/lance/src/dataset/write/update.rs @@ -329,6 +329,7 @@ impl UpdateJob { self.dataset.schema().clone(), Box::pin(stream), WriteParams::with_storage_version(version), + None, // TODO: support multiple bases for update ) .await?; diff --git a/rust/lance/src/io/commit.rs b/rust/lance/src/io/commit.rs index 46a35dc7d1e..922fcc9183e 100644 --- a/rust/lance/src/io/commit.rs +++ b/rust/lance/src/io/commit.rs @@ -1520,8 +1520,8 @@ mod tests { Fragment { id: 0, files: vec![ - DataFile::new_legacy_from_fields("path1", vec![0, 1, 2]), - DataFile::new_legacy_from_fields("unused", vec![9]), + DataFile::new_legacy_from_fields("path1", vec![0, 1, 2], None), + DataFile::new_legacy_from_fields("unused", vec![9], None), ], deletion_file: None, row_id_meta: None, @@ -1530,8 +1530,8 @@ mod tests { Fragment { id: 1, files: vec![ - DataFile::new_legacy_from_fields("path2", vec![0, 1, 2]), - DataFile::new_legacy_from_fields("path3", vec![2]), + DataFile::new_legacy_from_fields("path2", vec![0, 1, 2], None), + DataFile::new_legacy_from_fields("path3", vec![2], None), ], deletion_file: None, row_id_meta: None, @@ -1564,7 +1564,11 @@ mod tests { let expected_fragments = vec![ Fragment { id: 0, - files: vec![DataFile::new_legacy_from_fields("path1", vec![0, 1, 10])], + files: vec![DataFile::new_legacy_from_fields( + "path1", + vec![0, 1, 10], + None, + )], deletion_file: None, row_id_meta: None, physical_rows: None, @@ -1572,8 +1576,8 @@ mod tests { Fragment { id: 1, files: vec![ - DataFile::new_legacy_from_fields("path2", vec![0, 1, 2]), - DataFile::new_legacy_from_fields("path3", vec![10]), + DataFile::new_legacy_from_fields("path2", vec![0, 1, 2], None), + DataFile::new_legacy_from_fields("path3", vec![10], None), ], deletion_file: None, row_id_meta: None, diff --git a/rust/lance/src/io/commit/conflict_resolver.rs b/rust/lance/src/io/commit/conflict_resolver.rs index da9a95e12b3..8a2543f0c7b 100644 --- a/rust/lance/src/io/commit/conflict_resolver.rs +++ b/rust/lance/src/io/commit/conflict_resolver.rs @@ -2050,6 +2050,7 @@ mod tests { "overwrite-key".to_string(), "value".to_string(), )])), + initial_bases: None, }, Operation::Rewrite { groups: vec![RewriteGroup { @@ -2152,6 +2153,7 @@ mod tests { fragments: vec![fragment0.clone(), fragment2.clone()], schema: lance_core::datatypes::Schema::default(), config_upsert_values: None, + initial_bases: None, }, // No conflicts: overwrite can always happen since it doesn't // depend on previous state of the table. diff --git a/rust/lance/src/utils/test.rs b/rust/lance/src/utils/test.rs index 7d393cad3ad..0e591500c70 100644 --- a/rust/lance/src/utils/test.rs +++ b/rust/lance/src/utils/test.rs @@ -124,6 +124,7 @@ impl TestDatasetGenerator { fragments, schema, config_upsert_values: None, + initial_bases: None, }; Dataset::commit( From d1200f67a5604b2282756cd71df09af6a0e1034c Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 9 Oct 2025 16:00:46 -0700 Subject: [PATCH 02/11] modify --- ...test_multi_bases.py => test_multi_base.py} | 41 +- rust/lance/src/dataset/write.rs | 411 +++++++----------- rust/lance/src/dataset/write/insert.rs | 39 +- 3 files changed, 174 insertions(+), 317 deletions(-) rename python/python/tests/{test_multi_bases.py => test_multi_base.py} (93%) diff --git a/python/python/tests/test_multi_bases.py b/python/python/tests/test_multi_base.py similarity index 93% rename from python/python/tests/test_multi_bases.py rename to python/python/tests/test_multi_base.py index dcd1e84c7fc..8a46e9e0ecb 100644 --- a/python/python/tests/test_multi_bases.py +++ b/python/python/tests/test_multi_base.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright The Lance Authors """ -Tests for multi-bases dataset functionality. +Tests for multi-base dataset functionality. """ import shutil @@ -18,8 +18,8 @@ from lance import DatasetBasePath -class TestMultiBases: - """Test multi-bases dataset functionality with local file system.""" +class TestMultiBase: + """Test multi-base dataset functionality with local file system.""" def setup_method(self): """Set up test directories for each test.""" @@ -42,7 +42,7 @@ def teardown_method(self): shutil.rmtree(self.test_dir, ignore_errors=True) def create_test_data(self, num_rows=500, id_offset=0): - """Create test data for multi-bases tests.""" + """Create test data for multi-base tests.""" return pd.DataFrame( { "id": range(id_offset, id_offset + num_rows), @@ -51,11 +51,11 @@ def create_test_data(self, num_rows=500, id_offset=0): } ) - def test_multi_bases_create_and_read(self): - """Test creating a multi-bases dataset and reading it back.""" + def test_multi_base_create_and_read(self): + """Test creating a multi-base dataset and reading it back.""" data = self.create_test_data(500) - # Create dataset with multi-bases layout + # Create dataset with multi-base layout dataset = lance.write_dataset( data, self.primary_uri, @@ -82,8 +82,8 @@ def test_multi_bases_create_and_read(self): data.sort_values("id").reset_index(drop=True), ) - def test_multi_bases_append_mode(self): - """Test appending data to a multi-bases dataset.""" + def test_multi_base_append_mode(self): + """Test appending data to a multi-base dataset.""" # Create initial dataset initial_data = self.create_test_data(300) @@ -120,7 +120,7 @@ def test_multi_bases_append_mode(self): actual_ids = set(result["id"].tolist()) assert actual_ids == expected_ids - def test_multi_bases_overwrite_mode_inherits_bases(self): + def test_multi_base_overwrite_mode_inherits_bases(self): """Test OVERWRITE mode inherits existing base configuration.""" # Create initial dataset with multi-base configuration initial_data = self.create_test_data(200) @@ -165,7 +165,7 @@ def test_multi_bases_overwrite_mode_inherits_bases(self): assert any(bp.name == "path1" for bp in base_paths.values()) assert any(bp.name == "path2" for bp in base_paths.values()) - def test_multi_bases_overwrite_mode_primary_path_default(self): + def test_multi_base_overwrite_mode_primary_path_default(self): """Test that OVERWRITE mode defaults to primary path when no target specified.""" # Create initial dataset with explicit data file bases initial_data = self.create_test_data(100) @@ -225,7 +225,7 @@ def test_multi_bases_overwrite_mode_primary_path_default(self): assert len(final_result) == 100 def test_concurrent_appends(self): - """Test concurrent appends to multi-bases dataset.""" + """Test concurrent appends to multi-base dataset.""" # Create initial dataset initial_data = self.create_test_data(100) @@ -295,7 +295,7 @@ def append_worker(worker_id, target_path_name, start_id, num_rows): assert actual_ids == expected_ids def test_validation_errors(self): - """Test validation errors for invalid multi-bases configurations.""" + """Test validation errors for invalid multi-base configurations.""" data = self.create_test_data(100) # Test 1: Target reference not found in new bases for CREATE mode @@ -337,15 +337,6 @@ def test_validation_errors(self): ], ) - # Test 3: Write to dataset with explicit data file bases without target_bases - with pytest.raises(Exception, match="target_bases"): - lance.write_dataset( - data, - base_dataset, - mode="append", - # No target_bases specified - ) - def test_fragment_distribution(self): """Test that fragments are correctly distributed and readable.""" data = self.create_test_data(1000) @@ -376,7 +367,7 @@ def test_fragment_distribution(self): assert all(filtered_result["id"] < 500) def test_schema_consistency(self): - """Test that schema is consistent across multi-bases operations.""" + """Test that schema is consistent across multi-base operations.""" # Create dataset with specific schema schema = pa.schema( [ @@ -428,10 +419,10 @@ def test_schema_consistency(self): assert result.schema == schema def test_dataset_reopening(self): - """Test reopening a multi-bases dataset.""" + """Test reopening a multi-base dataset.""" data = self.create_test_data(300) - # Create multi-bases dataset + # Create multi-base dataset dataset = lance.write_dataset( data, self.primary_uri, diff --git a/rust/lance/src/dataset/write.rs b/rust/lance/src/dataset/write.rs index dc8c4323c6f..c9ffda55c75 100644 --- a/rust/lance/src/dataset/write.rs +++ b/rust/lance/src/dataset/write.rs @@ -1,10 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -use std::collections::HashMap; -use std::num::NonZero; -use std::sync::Arc; -use std::sync::atomic::AtomicUsize; use arrow_array::RecordBatch; use chrono::TimeDelta; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; @@ -25,11 +21,15 @@ use lance_file::v2::writer::FileWriterOptions; use lance_file::version::LanceFileVersion; use lance_file::writer::{FileWriter, ManifestProvider}; use lance_io::object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry}; -use lance_table::format::{BasePath, DataFile, Fragment, Manifest}; +use lance_table::format::{BasePath, DataFile, Fragment}; use lance_table::io::commit::{commit_handler_from_url, CommitHandler}; use lance_table::io::manifest::ManifestDescribing; use object_store::path::Path; use snafu::location; +use std::collections::HashMap; +use std::num::NonZero; +use std::sync::atomic::AtomicUsize; +use std::sync::Arc; use tracing::{info, instrument}; use crate::session::Session; @@ -399,7 +399,7 @@ pub async fn do_write_fragments( base_dir, schema, storage_version, - target_bases_info + target_bases_info, ); let mut writer: Option> = None; let mut num_rows_in_current_file = 0; @@ -452,127 +452,60 @@ pub struct WrittenFragments { pub blob: Option<(Vec, Schema)>, } -/// Resolve target bases for writing. -/// -/// This function should be called early in the write flow (e.g., in InsertBuilder) -/// before calling write_fragments_internal. -/// -/// This function: -/// 1. Assigns sequential IDs to base paths that have ID 0 in initial_bases -/// 2. Validates that all IDs in initial_bases are non-zero and unique -/// 3. Resolves target_base_names_or_paths to target_bases IDs -/// 4. Validates that all target base IDs are non-zero -/// 5. Prepares TargetBaseInfo structs with object stores for round-robin selection -/// -/// Returns Option> - Some if target bases are configured, None otherwise -pub(crate) async fn resolve_target_bases( +pub async fn validate_and_resolve_target_bases( params: &mut WriteParams, - existing_manifest: Option<&Manifest>, - store_registry: Arc, + existing_base_paths: Option<&HashMap>, ) -> Result>> { - // Step 1: Assign IDs to initial_bases that have ID 0 + // Step 1: Validations + if !matches!(params.mode, WriteMode::Create) && params.initial_bases.is_some() { + return Err(Error::invalid_input( + format!( + "Cannot register new bases in {:?} mode. Only CREATE mode can register new bases.", + params.mode + ), + location!(), + )); + } + + if params.target_base_names_or_paths.is_some() && params.target_bases.is_some() { + return Err(Error::invalid_input( + "Cannot specify both target_base_names_or_paths and target_bases. Use one or the other.", + location!(), + )); + } + + // Step 2: Assign IDs to initial_bases and add them to all_bases + let mut all_bases: HashMap = existing_base_paths.cloned().unwrap_or_default(); if let Some(initial_bases) = &mut params.initial_bases { - let mut next_id = 1u32; + let mut next_id = all_bases.keys().max().map(|&id| id + 1).unwrap_or(1); + for base_path in initial_bases.iter_mut() { if base_path.id == 0 { base_path.id = next_id; next_id += 1; - } else { - // Track the highest assigned ID to avoid conflicts - next_id = next_id.max(base_path.id + 1); } + all_bases.insert(base_path.id, base_path.clone()); } } - // Step 2: Validate that IDs in initial_bases are non-zero and unique - if let Some(initial_bases) = ¶ms.initial_bases { - let mut seen_ids = std::collections::HashSet::new(); - for base_path in initial_bases { - if base_path.id == 0 { - return Err(Error::invalid_input( - format!( - "Base path '{}' has ID 0. IDs must be non-zero and assigned.", - base_path.name.as_ref().unwrap_or(&base_path.path) - ), - location!(), - )); - } - if !seen_ids.insert(base_path.id) { - return Err(Error::invalid_input( - format!("Duplicate base path ID: {}", base_path.id), - location!(), - )); - } - } - } - - // Step 3: Resolve target_base_names_or_paths to IDs if present - let resolved_target_bases = if let Some(ref names_or_paths) = params.target_base_names_or_paths - { + // Step 3: Resolve target_base_names_or_paths to IDs + let target_base_ids = if let Some(ref names_or_paths) = params.target_base_names_or_paths { let mut resolved_ids = Vec::new(); for reference in names_or_paths { let ref_str = reference.as_str(); - let mut id_found = None; - - // Check initial_bases by name - if let Some(initial_bases) = ¶ms.initial_bases { - for base in initial_bases { - if let Some(name) = &base.name { - if name == ref_str { - id_found = Some(base.id); - break; - } - } - } - } - - // Check initial_bases by path - if id_found.is_none() { - if let Some(initial_bases) = ¶ms.initial_bases { - for base in initial_bases { - if base.path == ref_str { - id_found = Some(base.id); - break; - } - } - } - } - - // Check existing manifest by name - if id_found.is_none() { - if let Some(manifest) = existing_manifest { - for (id, base) in &manifest.base_paths { - if let Some(name) = &base.name { - if name == ref_str { - id_found = Some(*id); - break; - } - } - } - } - } - - // Check existing manifest by path - if id_found.is_none() { - if let Some(manifest) = existing_manifest { - for (id, base) in &manifest.base_paths { - if base.path == ref_str { - id_found = Some(*id); - break; - } - } - } - } - - let id = id_found.ok_or_else(|| { - Error::invalid_input( - format!( - "Base reference '{}' not found in initial_bases or existing manifest", - ref_str - ), - location!(), - ) - })?; + let id = all_bases + .iter() + .find(|(_, base)| { + base.name.as_ref().map(|n| n == ref_str).unwrap_or(false) + || base.path == ref_str + }) + .map(|(&id, _)| id) + .ok_or_else(|| { + Error::invalid_input( + format!("Base reference '{}' not found in available bases", ref_str), + location!(), + ) + })?; resolved_ids.push(id); } @@ -581,74 +514,44 @@ pub(crate) async fn resolve_target_bases( params.target_bases.clone() }; - // Step 4: prepare TargetBaseInfo structs - if let Some(target_bases) = &resolved_target_bases { + // Step 4: Prepare TargetBaseInfo structs + let store_registry = params + .session + .as_ref() + .map(|s| s.store_registry()) + .unwrap_or_default(); + + if let Some(target_bases) = &target_base_ids { let store_params = params.store_params.clone().unwrap_or_default(); + let mut bases_info = Vec::new(); - // Collect information for all target bases - let mut base_infos = Vec::new(); for &target_base_id in target_bases { - // Find the base_path for this ID - let (target_path_uri, is_dataset_root) = match ¶ms.initial_bases { - Some(initial_bases) => { - // CREATE or OVERWRITE with new bases: find the base_path by ID - let base_path = initial_bases - .iter() - .find(|base_path| base_path.id == target_base_id) - .ok_or_else(|| { - Error::invalid_input( - format!( - "Target base ID {} not found in initial_bases", - target_base_id - ), - location!(), - ) - })?; - (base_path.path.clone(), base_path.is_dataset_root) - } - None => { - // APPEND or OVERWRITE without new bases: find in existing manifest - let manifest = existing_manifest.ok_or_else(|| { - Error::invalid_input( - format!( - "Cannot use target base ID {} without existing manifest or initial_bases", - target_base_id - ), - location!(), - ) - })?; - - let base_path = manifest.base_paths.get(&target_base_id).ok_or_else(|| { - Error::invalid_input( - format!( - "Target base ID {} not found in existing manifest", - target_base_id - ), - location!(), - ) - })?; - (base_path.path.clone(), base_path.is_dataset_root) - } - }; + let base_path = all_bases.get(&target_base_id).ok_or_else(|| { + Error::invalid_input( + format!( + "Target base ID {} not found in available bases", + target_base_id + ), + location!(), + ) + })?; - // Extract path from URI using standard ObjectStore method let (target_object_store, extracted_path) = ObjectStore::from_uri_and_params( store_registry.clone(), - &target_path_uri, + &base_path.path, &store_params, ) - .await?; + .await?; - // Store the raw path - /data will be added later based on is_dataset_root - base_infos.push(TargetBaseInfo { + bases_info.push(TargetBaseInfo { base_id: target_base_id, object_store: target_object_store, base_dir: extracted_path, - is_dataset_root, + is_dataset_root: base_path.is_dataset_root, }); } - Ok(Some(base_infos)) + Ok(Some(bases_info)) } else { Ok(None) } @@ -905,15 +808,7 @@ pub async fn open_writer( base_dir: &Path, storage_version: LanceFileVersion, ) -> Result> { - open_writer_with_options( - object_store, - schema, - base_dir, - storage_version, - true, - None, - ) - .await + open_writer_with_options(object_store, schema, base_dir, storage_version, true, None).await } pub async fn open_writer_with_options( @@ -1810,6 +1705,36 @@ mod tests { .iter() .any(|file| file.base_id == Some(1))); } + + // Test validation: cannot specify both target_bases and target_base_names_or_paths + let mut data_gen2 = + BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); + + let result = Dataset::write( + data_gen2.batch(5), + &format!("{}/test_validation", test_uri), + Some(WriteParams { + mode: WriteMode::Create, + initial_bases: Some(vec![ + BasePath { + id: 1, + name: Some("base1".to_string()), + path: base1_uri.clone(), + is_dataset_root: true, + }, + ]), + target_bases: Some(vec![1]), + target_base_names_or_paths: Some(vec!["base1".to_string()]), + ..Default::default() + }), + ) + .await; + + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("Cannot specify both target_base_names_or_paths and target_bases")); } #[tokio::test] @@ -1826,7 +1751,7 @@ mod tests { let mut data_gen = BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); - let dataset = crate::dataset::Dataset::write( + let dataset = Dataset::write( data_gen.batch(3), &primary_uri, Some(WriteParams { @@ -1859,7 +1784,7 @@ mod tests { let mut data_gen2 = BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); - let dataset = crate::dataset::Dataset::write( + let dataset = Dataset::write( data_gen2.batch(2), std::sync::Arc::new(dataset), Some(WriteParams { @@ -1890,11 +1815,38 @@ mod tests { // Verify data was written to base2 (ID 2) let fragments = dataset.get_fragments(); - assert!(fragments.iter().any(|f| f + assert!(fragments.iter().all(|f| f .metadata .files .iter() - .any(|file| file.base_id == Some(2)))); + .all(|file| file.base_id == Some(2)))); + + // Test validation: cannot specify initial_bases in OVERWRITE mode + let mut data_gen3 = + BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); + + let result = Dataset::write( + data_gen3.batch(2), + Arc::new(dataset), + Some(WriteParams { + mode: WriteMode::Overwrite, + initial_bases: Some(vec![BasePath { + id: 3, + name: Some("base3".to_string()), + path: _base3_uri.clone(), + is_dataset_root: true, + }]), + target_bases: Some(vec![1]), + ..Default::default() + }), + ) + .await; + + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("Cannot register new bases in Overwrite mode")); } #[tokio::test] @@ -1910,7 +1862,7 @@ mod tests { let mut data_gen = BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); - let dataset = crate::dataset::Dataset::write( + let dataset = Dataset::write( data_gen.batch(3), &primary_uri, Some(WriteParams { @@ -1942,7 +1894,7 @@ mod tests { let mut data_gen2 = BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); - let dataset = crate::dataset::Dataset::write( + let dataset = Dataset::write( data_gen2.batch(2), std::sync::Arc::new(dataset), Some(WriteParams { @@ -1963,9 +1915,9 @@ mod tests { let mut data_gen3 = BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); - let dataset = crate::dataset::Dataset::write( + let dataset = Dataset::write( data_gen3.batch(4), - std::sync::Arc::new(dataset), + Arc::new(dataset), Some(WriteParams { mode: WriteMode::Append, target_bases: Some(vec![2]), @@ -1988,88 +1940,33 @@ mod tests { assert!(has_base1_data, "Should have data in base1"); assert!(has_base2_data, "Should have data in base2"); - } - - #[tokio::test] - async fn test_multi_base_overwrite_reference_existing() { - use lance_testing::datagen::{BatchGenerator, IncrementingInt32}; - - // Create initial dataset with multi-base configuration - let test_uri = "memory://overwrite_ref_test"; - let primary_uri = format!("{}/primary", test_uri); - let base1_uri = format!("{}/base1", test_uri); - let base2_uri = format!("{}/base2", test_uri); - let mut data_gen = + // Test validation: cannot specify initial_bases in APPEND mode + let mut data_gen4 = BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); + let base3_uri = format!("{}/base3", test_uri); - let dataset = crate::dataset::Dataset::write( - data_gen.batch(3), - &primary_uri, + let result = Dataset::write( + data_gen4.batch(2), + Arc::new(dataset), Some(WriteParams { - mode: WriteMode::Create, - initial_bases: Some(vec![ - BasePath { - id: 1, - name: Some("base1".to_string()), - path: base1_uri.clone(), - is_dataset_root: true, - }, - BasePath { - id: 2, - name: Some("base2".to_string()), - path: base2_uri.clone(), - is_dataset_root: true, - }, - ]), + mode: WriteMode::Append, + initial_bases: Some(vec![BasePath { + id: 3, + name: Some("base3".to_string()), + path: base3_uri, + is_dataset_root: true, + }]), target_bases: Some(vec![1]), ..Default::default() }), ) - .await - .unwrap(); + .await; - assert_eq!(dataset.count_rows(None).await.unwrap(), 3); - assert_eq!(dataset.manifest.base_paths.len(), 2); - - // Overwrite referencing existing base2 without registering new bases - let mut data_gen2 = - BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); - - let dataset = crate::dataset::Dataset::write( - data_gen2.batch(5), - std::sync::Arc::new(dataset), - Some(WriteParams { - mode: WriteMode::Overwrite, - target_bases: Some(vec![2]), - ..Default::default() - }), - ) - .await - .unwrap(); - - // Verify data was overwritten with new row count - assert_eq!(dataset.count_rows(None).await.unwrap(), 5); - - // Verify base_paths are still the same (no new bases registered) - assert_eq!(dataset.manifest.base_paths.len(), 2); - assert!(dataset - .manifest - .base_paths - .values() - .any(|bp| bp.name == Some("base1".to_string()))); - assert!(dataset - .manifest - .base_paths - .values() - .any(|bp| bp.name == Some("base2".to_string()))); - - // Verify data was written to base2 - let fragments = dataset.get_fragments(); - assert!(fragments.iter().any(|f| f - .metadata - .files - .iter() - .any(|file| file.base_id == Some(2)))); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("Cannot register new bases in Append mode")); } } diff --git a/rust/lance/src/dataset/write/insert.rs b/rust/lance/src/dataset/write/insert.rs index e4b2cdb1838..b8c9c225724 100644 --- a/rust/lance/src/dataset/write/insert.rs +++ b/rust/lance/src/dataset/write/insert.rs @@ -25,7 +25,7 @@ use snafu::location; use crate::dataset::builder::DatasetBuilder; use crate::dataset::transaction::{Operation, Transaction, TransactionBuilder}; -use crate::dataset::write::{write_fragments_internal, TargetBaseInfo}; +use crate::dataset::write::{validate_and_resolve_target_bases, write_fragments_internal}; use crate::dataset::ReadParams; use crate::Dataset; use crate::{Error, Result}; @@ -195,7 +195,9 @@ impl<'a> InsertBuilder<'a> { self.validate_write(&mut context, &schema)?; - let target_base_info = self.validate_and_resolve_multi_base(&mut context).await?; + let existing_base_paths = context.dest.dataset().map(|ds| &ds.manifest.base_paths); + let target_base_info = + validate_and_resolve_target_bases(&mut context.params, existing_base_paths).await?; let written_frags = write_fragments_internal( context.dest.dataset(), @@ -213,39 +215,6 @@ impl<'a> InsertBuilder<'a> { Ok((transaction, context)) } - async fn validate_and_resolve_multi_base( - &self, - context: &mut WriteContext<'_>, - ) -> Result>> { - - if !matches!(context.params.mode, WriteMode::Create) - && context.params.initial_bases.is_some() - { - return Err(Error::invalid_input( - format!( - "Cannot register new bases in {:?} mode. Only CREATE mode can register new bases.", - context.params.mode - ), - location!(), - )); - } - - let existing_manifest = context.dest.dataset().map(|ds| ds.manifest.as_ref()); - let store_registry = context - .params - .session - .as_ref() - .map(|s| s.store_registry()) - .unwrap_or_default(); - - crate::dataset::write::resolve_target_bases( - &mut context.params, - existing_manifest, - store_registry, - ) - .await - } - fn build_transaction( schema: Schema, written_frags: WrittenFragments, From 1396b1c5564908f53d05499043a528d588e6beb6 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 9 Oct 2025 16:25:42 -0700 Subject: [PATCH 03/11] seems good now --- python/python/lance/dataset.py | 51 ++-- python/python/tests/test_multi_base.py | 324 +++++++++---------------- python/src/dataset.rs | 17 +- rust/lance/src/dataset/write.rs | 158 +++++++++++- 4 files changed, 302 insertions(+), 248 deletions(-) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 43cd7285e02..f9ef6d44433 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -4898,7 +4898,8 @@ def write_dataset( auto_cleanup_options: Optional[AutoCleanupConfig] = None, commit_message: Optional[str] = None, transaction_properties: Optional[Dict[str, str]] = None, - target_bases: Optional[List[Union[str, DatasetBasePath]]] = None, + initial_bases: Optional[List[DatasetBasePath]] = None, + target_bases: Optional[List[str]] = None, ) -> LanceDataset: """Write a given data_obj to the given uri @@ -4977,32 +4978,27 @@ def write_dataset( and can be retrieved using read_transaction(). If both `commit_message` and `properties` are provided, `commit_message` will override any "lance.commit.message" key in `properties`. - target_bases: list of str or DatasetBasePath, optional - Specifies which base paths to use for writing data. Can contain: + initial_bases: list of DatasetBasePath, optional + New base paths to register in the manifest. Only used in **CREATE mode**. + Cannot be specified in APPEND or OVERWRITE modes. - - :class:`DatasetBasePath` objects: Register new base paths (**CREATE mode only**) - - Strings: Reference existing bases by name or path (for all modes) + Each :class:`DatasetBasePath` defines a storage location with an optional name: - **Important**: OVERWRITE mode **always inherits** existing base_paths and cannot - register new ones. Only CREATE mode can register new bases. + >>> from lance import DatasetBasePath + >>> bases = [ + ... DatasetBasePath("s3://bucket1/data", name="primary"), + ... DatasetBasePath("s3://bucket2/data", name="archive") + ... ] - When a string is provided, it will be resolved by trying to match: - 1. Base name (e.g., "primary", "archive") - 2. Base path URI (e.g., "s3://bucket/path") + target_bases: list of str, optional + References to base paths where data should be written. Can be specified in all modes. - For CREATE mode, you can define new bases and write to them in one call: + Each string is resolved by trying to match: + 1. Base name (e.g., "primary", "archive") from registered bases + 2. Base path URI (e.g., "s3://bucket1/data") - >>> bases = [DatasetBasePath("s3://bucket1/data", name="primary"), - ... DatasetBasePath("s3://bucket2/data", name="archive")] - >>> ds = write_dataset(data, "s3://bucket1/dataset", mode="create", - ... target_bases=bases) - - For OVERWRITE and APPEND modes, reference existing bases by name or path: - - >>> # OVERWRITE always inherits existing base_paths - >>> write_dataset(data, ds, mode="overwrite", target_bases=["primary"]) - >>> write_dataset(data, ds, mode="append", target_bases=["primary"]) - >>> write_dataset(data, ds, mode="append", target_bases=["s3://bucket1/data"]) + **CREATE mode**: References must match bases in `initial_bases` + **APPEND/OVERWRITE modes**: References must match bases in the existing manifest Examples -------- @@ -5010,12 +5006,16 @@ def write_dataset( >>> import lance >>> from lance import DatasetBasePath - >>> bases = [ + >>> initial_bases = [ ... DatasetBasePath("s3://fast-storage/data", name="hot"), ... DatasetBasePath("s3://archive-storage/data", name="cold") ... ] - >>> ds = lance.write_dataset(data, "s3://fast-storage/mydataset", - ... mode="create", target_bases=bases) + >>> ds = lance.write_dataset( + ... data, "s3://fast-storage/mydataset", + ... mode="create", + ... initial_bases=initial_bases, + ... target_bases=["hot"] + ... ) Append data to a specific base by name: @@ -5065,6 +5065,7 @@ def write_dataset( "enable_stable_row_ids": enable_stable_row_ids, "auto_cleanup_options": auto_cleanup_options, "transaction_properties": merged_properties, + "initial_bases": initial_bases, "target_bases": target_bases, } diff --git a/python/python/tests/test_multi_base.py b/python/python/tests/test_multi_base.py index 8a46e9e0ecb..0ea85bc53a1 100644 --- a/python/python/tests/test_multi_base.py +++ b/python/python/tests/test_multi_base.py @@ -60,12 +60,12 @@ def test_multi_base_create_and_read(self): data, self.primary_uri, mode="create", - target_bases=[ + initial_bases=[ DatasetBasePath(self.path1_uri, name="path1"), DatasetBasePath(self.path2_uri, name="path2"), DatasetBasePath(self.path3_uri, name="path3"), - "path2", # Write data to path2 ], + target_bases=["path2"], # Write data to path2 max_rows_per_file=100, # Force multiple fragments ) @@ -91,11 +91,11 @@ def test_multi_base_append_mode(self): initial_data, self.primary_uri, mode="create", - target_bases=[ + initial_bases=[ DatasetBasePath(self.path1_uri, name="path1"), DatasetBasePath(self.path2_uri, name="path2"), - "path1", # Write to path1 ], + target_bases=["path1"], # Write to path1 max_rows_per_file=100, ) @@ -129,11 +129,11 @@ def test_multi_base_overwrite_mode_inherits_bases(self): initial_data, self.primary_uri, mode="create", - target_bases=[ + initial_bases=[ DatasetBasePath(self.path1_uri, name="path1"), DatasetBasePath(self.path2_uri, name="path2"), - "path1", # Write to path1 ], + target_bases=["path1"], # Write to path1 max_rows_per_file=100, ) @@ -174,11 +174,11 @@ def test_multi_base_overwrite_mode_primary_path_default(self): initial_data, self.primary_uri, mode="create", - target_bases=[ + initial_bases=[ DatasetBasePath(self.path1_uri, name="path1"), DatasetBasePath(self.path2_uri, name="path2"), - "path1", # Write to path1 ], + target_bases=["path1"], # Write to path1 max_rows_per_file=50, ) @@ -211,6 +211,16 @@ def test_multi_base_overwrite_mode_primary_path_default(self): assert any(bp.name == "path1" for bp in base_paths.values()) assert any(bp.name == "path2" for bp in base_paths.values()) + # Verify that data files were written to primary path (not to path1 or path2) + fragments = list(updated_dataset.get_fragments()) + for fragment in fragments: + # Fragment files should be in primary path (base_id should be None) + data_file = fragment.data_files()[0] + assert data_file.base_id is None, ( + f"Expected data file to be in primary path (base_id=None), " + f"but got base_id={data_file.base_id}" + ) + # Since base_paths exist, we can append to them by name append_data = self.create_test_data(25, id_offset=300) @@ -224,75 +234,114 @@ def test_multi_base_overwrite_mode_primary_path_default(self): final_result = final_dataset.to_table().to_pandas() assert len(final_result) == 100 - def test_concurrent_appends(self): - """Test concurrent appends to multi-base dataset.""" - # Create initial dataset + def test_multi_base_append_mode_primary_path_default(self): + """Test that APPEND mode defaults to primary path when no target specified.""" + # Create initial dataset with explicit data file bases initial_data = self.create_test_data(100) - lance.write_dataset( + dataset = lance.write_dataset( initial_data, self.primary_uri, mode="create", - target_bases=[ + initial_bases=[ DatasetBasePath(self.path1_uri, name="path1"), DatasetBasePath(self.path2_uri, name="path2"), - DatasetBasePath(self.path3_uri, name="path3"), - "path1", # Write to path1 ], + target_bases=["path1"], # Write to path1 max_rows_per_file=50, ) - def append_worker(worker_id, target_path_name, start_id, num_rows): - """Worker function to append data concurrently.""" - worker_data = self.create_test_data(num_rows, id_offset=start_id) + # Append without specifying target - should use primary path + append_data = self.create_test_data(75, id_offset=100) - # Load dataset fresh for this worker - worker_dataset = lance.dataset(self.primary_uri) + updated_dataset = lance.write_dataset( + append_data, + dataset, + mode="append", + # No target_bases specified - data goes to primary path + max_rows_per_file=25, + ) - # Perform append - return lance.write_dataset( - worker_data, - worker_dataset, - mode="append", - target_bases=[target_path_name], # Reference by name - max_rows_per_file=25, - ) + # Verify appended data + result = updated_dataset.to_table().to_pandas() + assert len(result) == 175 - # Configure concurrent append tasks - append_tasks = [ - {"worker_id": 1, "target_path": "path1", "start_id": 100, "num_rows": 50}, - {"worker_id": 2, "target_path": "path2", "start_id": 150, "num_rows": 50}, - {"worker_id": 3, "target_path": "path3", "start_id": 200, "num_rows": 50}, - ] - - # Execute concurrent appends - results = [] - with ThreadPoolExecutor(max_workers=3) as executor: - future_to_task = { - executor.submit( - append_worker, - task["worker_id"], - task["target_path"], - task["start_id"], - task["num_rows"], - ): task - for task in append_tasks - } + # Verify data content + expected_ids = set(range(175)) + actual_ids = set(result["id"].tolist()) + assert actual_ids == expected_ids - for future in as_completed(future_to_task): - results.append(future.result()) + # Verify base_paths are preserved from previous manifest + base_paths = updated_dataset._ds.base_paths() + assert len(base_paths) == 2 + assert any(bp.name == "path1" for bp in base_paths.values()) + assert any(bp.name == "path2" for bp in base_paths.values()) - # Verify final dataset integrity - final_dataset = lance.dataset(self.primary_uri) - final_data = final_dataset.to_table().to_pandas() + # Verify that new data files were written to primary path (not to path1 or path2) + fragments = list(updated_dataset.get_fragments()) + # The first 2 fragments should be in path1 (from initial write) + # The remaining fragments should be in primary path (from append) + primary_path_fragments = 0 + path1_fragments = 0 + + for fragment in fragments: + data_file = fragment.data_files()[0] + if data_file.base_id is None: + primary_path_fragments += 1 + else: + base_path = base_paths.get(data_file.base_id) + if base_path and base_path.name == "path1": + path1_fragments += 1 + + assert path1_fragments == 2, f"Expected 2 fragments in path1, got {path1_fragments}" + assert primary_path_fragments >= 3, ( + f"Expected at least 3 fragments in primary path, got {primary_path_fragments}" + ) - # Should have initial 100 + 3 * 50 = 250 rows - assert len(final_data) == 250 + def test_multi_base_is_dataset_root_flag(self): + """Test is_dataset_root flag on DatasetBasePath.""" + # Create initial dataset with one base marked as dataset_root + initial_data = self.create_test_data(100) - # Verify all expected IDs are present - expected_ids = set(range(250)) - actual_ids = set(final_data["id"].tolist()) - assert actual_ids == expected_ids + dataset = lance.write_dataset( + initial_data, + self.primary_uri, + mode="create", + initial_bases=[ + DatasetBasePath(self.path1_uri, name="path1", is_dataset_root=True), + DatasetBasePath(self.path2_uri, name="path2", is_dataset_root=False), + ], + target_bases=["path1"], + max_rows_per_file=50, + ) + + # Verify base_paths configuration + base_paths = dataset._ds.base_paths() + assert len(base_paths) == 2 + + # Find path1 and path2 in base_paths + path1_base = None + path2_base = None + for base_path in base_paths.values(): + if base_path.name == "path1": + path1_base = base_path + elif base_path.name == "path2": + path2_base = base_path + + assert path1_base is not None, "path1 base not found" + assert path2_base is not None, "path2 base not found" + + # Verify is_dataset_root flags + assert path1_base.is_dataset_root is True, ( + f"Expected path1.is_dataset_root=True, got {path1_base.is_dataset_root}" + ) + assert path2_base.is_dataset_root is False, ( + f"Expected path2.is_dataset_root=False, got {path2_base.is_dataset_root}" + ) + + # Verify data is readable + result = dataset.to_table().to_pandas() + assert len(result) == 100 def test_validation_errors(self): """Test validation errors for invalid multi-base configurations.""" @@ -304,11 +353,11 @@ def test_validation_errors(self): data, self.primary_uri, mode="create", - target_bases=[ + initial_bases=[ DatasetBasePath(self.path1_uri, name="path1"), DatasetBasePath(self.path2_uri, name="path2"), - "path3", # Not in new bases ], + target_bases=["path3"], # Not in new bases ) # Test 2: DatasetBasePath in APPEND mode @@ -317,11 +366,11 @@ def test_validation_errors(self): data, self.primary_uri, mode="create", - target_bases=[ + initial_bases=[ DatasetBasePath(self.path1_uri, name="path1"), DatasetBasePath(self.path2_uri, name="path2"), - "path1", ], + target_bases=["path1"], ) with pytest.raises(Exception, match="Cannot register new bases in Append mode"): @@ -329,12 +378,12 @@ def test_validation_errors(self): data, base_dataset, mode="append", - target_bases=[ + initial_bases=[ DatasetBasePath( name="path3", path=self.path3_uri ), # New base not allowed - "path3", ], + target_bases=["path3"], ) def test_fragment_distribution(self): @@ -345,11 +394,11 @@ def test_fragment_distribution(self): data, self.primary_uri, mode="create", - target_bases=[ + initial_bases=[ DatasetBasePath(self.path1_uri, name="path1"), DatasetBasePath(self.path2_uri, name="path2"), - "path1", ], + target_bases=["path1"], max_rows_per_file=100, # Should create 10 fragments ) @@ -364,143 +413,4 @@ def test_fragment_distribution(self): # Test scanning with filters filtered_result = dataset.scanner(filter="id < 500").to_table().to_pandas() assert len(filtered_result) == 500 - assert all(filtered_result["id"] < 500) - - def test_schema_consistency(self): - """Test that schema is consistent across multi-base operations.""" - # Create dataset with specific schema - schema = pa.schema( - [ - pa.field("id", pa.int64()), - pa.field("name", pa.string()), - pa.field("score", pa.float64()), - ] - ) - - data = pa.table( - { - "id": [1, 2, 3, 4, 5], - "name": ["a", "b", "c", "d", "e"], - "score": [1.0, 2.0, 3.0, 4.0, 5.0], - }, - schema=schema, - ) - - dataset = lance.write_dataset( - data, - self.primary_uri, - mode="create", - target_bases=[ - DatasetBasePath(self.path1_uri, name="path1"), - DatasetBasePath(self.path2_uri, name="path2"), - "path1", - ], - ) - - # Verify schema - assert dataset.schema == schema - - # Append with same schema to different path - append_data = pa.table( - {"id": [6, 7, 8], "name": ["f", "g", "h"], "score": [6.0, 7.0, 8.0]}, - schema=schema, - ) - - updated_dataset = lance.write_dataset( - append_data, dataset, mode="append", target_bases=["path2"] - ) - - # Schema should remain consistent - assert updated_dataset.schema == schema - - # Verify all data - result = updated_dataset.to_table() - assert len(result) == 8 - assert result.schema == schema - - def test_dataset_reopening(self): - """Test reopening a multi-base dataset.""" - data = self.create_test_data(300) - - # Create multi-base dataset - dataset = lance.write_dataset( - data, - self.primary_uri, - mode="create", - target_bases=[ - DatasetBasePath(self.path1_uri, name="path1"), - DatasetBasePath(self.path2_uri, name="path2"), - "path1", - ], - max_rows_per_file=100, - ) - - # Close and reopen dataset - del dataset - reopened_dataset = lance.dataset(self.primary_uri) - - # Verify data is still accessible - result = reopened_dataset.to_table().to_pandas() - assert len(result) == 300 - - # Verify we can still append to different paths - append_data = self.create_test_data(50, id_offset=300) - - updated_dataset = lance.write_dataset( - append_data, - reopened_dataset, - mode="append", - target_bases=["path2"], # Reference by name - ) - - final_result = updated_dataset.to_table().to_pandas() - assert len(final_result) == 350 - - def test_single_path_mode(self): - """Test behavior with single path configuration.""" - data = self.create_test_data(100) - - # Test with single path (should work like normal dataset) - dataset = lance.write_dataset( - data, - self.primary_uri, - mode="create", - target_bases=[ - DatasetBasePath(self.path1_uri, name="path1"), - "path1", - ], - ) - - result = dataset.to_table().to_pandas() - assert len(result) == 100 - - # Verify data integrity - pd.testing.assert_frame_equal( - result.sort_values("id").reset_index(drop=True), - data.sort_values("id").reset_index(drop=True), - ) - - def test_primary_path_write_rejection(self): - """Test that writing to primary path URI is rejected.""" - data = self.create_test_data(50) - - # Create dataset with explicit data file bases - dataset = lance.write_dataset( - data, - self.primary_uri, - mode="create", - target_bases=[ - DatasetBasePath(self.path1_uri, name="path1"), - DatasetBasePath(self.path2_uri, name="path2"), - "path1", - ], - ) - - # Try to append using primary path URI (should fail - not in registered bases) - with pytest.raises(Exception, match="not found"): - lance.write_dataset( - data, - dataset, - mode="append", - target_bases=[self.primary_uri], # Primary path URI not registered - ) + assert all(filtered_result["id"] < 500) \ No newline at end of file diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 7b5933ef939..e8b26ef96d2 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -2710,18 +2710,16 @@ pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult>>(options, "target_bases")? { + // Handle initial_bases parameter (list of DatasetBasePath objects) + if let Some(initial_bases_list) = get_dict_opt::>>(options, "initial_bases")? { let mut new_bases = Vec::new(); - let mut target_base_refs = Vec::new(); - for item in target_bases_list.iter() { + for item in initial_bases_list.iter() { if let Ok(dataset_base_path) = item.extract::() { new_bases.push(BasePath::from(dataset_base_path)); - } else if let Ok(s) = item.extract::() { - target_base_refs.push(s); } else { return Err(PyValueError::new_err( - "target_bases must contain DatasetBasePath objects or strings" + "initial_bases must contain DatasetBasePath objects only" )); } } @@ -2729,9 +2727,12 @@ pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult>(options, "target_bases")? { + if !target_bases_list.is_empty() { + p = p.with_target_base_names_or_paths(target_bases_list); } } diff --git a/rust/lance/src/dataset/write.rs b/rust/lance/src/dataset/write.rs index c9ffda55c75..eefd3f6e835 100644 --- a/rust/lance/src/dataset/write.rs +++ b/rust/lance/src/dataset/write.rs @@ -1715,14 +1715,12 @@ mod tests { &format!("{}/test_validation", test_uri), Some(WriteParams { mode: WriteMode::Create, - initial_bases: Some(vec![ - BasePath { - id: 1, - name: Some("base1".to_string()), - path: base1_uri.clone(), - is_dataset_root: true, - }, - ]), + initial_bases: Some(vec![BasePath { + id: 1, + name: Some("base1".to_string()), + path: base1_uri.clone(), + is_dataset_root: true, + }]), target_bases: Some(vec![1]), target_base_names_or_paths: Some(vec!["base1".to_string()]), ..Default::default() @@ -1969,4 +1967,148 @@ mod tests { .to_string() .contains("Cannot register new bases in Append mode")); } + + #[tokio::test] + async fn test_is_dataset_root_flag() { + use lance_core::utils::tempfile::TempDir; + use lance_testing::datagen::{BatchGenerator, IncrementingInt32}; + + // Create dataset with different is_dataset_root settings using tempdir + let test_dir = TempDir::default(); + let primary_uri = test_dir.path_str(); + let base1_dir = test_dir.std_path().join("base1"); + let base2_dir = test_dir.std_path().join("base2"); + + std::fs::create_dir_all(&base1_dir).unwrap(); + std::fs::create_dir_all(&base2_dir).unwrap(); + + let base1_uri = format!("file://{}", base1_dir.display()); + let base2_uri = format!("file://{}", base2_dir.display()); + + let mut data_gen = + BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); + + let dataset = Dataset::write( + data_gen.batch(10), + &primary_uri, + Some(WriteParams { + mode: WriteMode::Create, + max_rows_per_file: 5, // Create multiple fragments + initial_bases: Some(vec![ + BasePath { + id: 1, + name: Some("base1".to_string()), + path: base1_uri.clone(), + is_dataset_root: true, // Files will go to base1/data/ + }, + BasePath { + id: 2, + name: Some("base2".to_string()), + path: base2_uri.clone(), + is_dataset_root: false, // Files will go directly to base2/ + }, + ]), + target_bases: Some(vec![1, 2]), // Write to both bases + ..Default::default() + }), + ) + .await + .unwrap(); + + assert_eq!(dataset.count_rows(None).await.unwrap(), 10); + + // Verify base_paths configuration + assert_eq!(dataset.manifest.base_paths.len(), 2); + + let base1 = dataset + .manifest + .base_paths + .values() + .find(|bp| bp.name == Some("base1".to_string())) + .expect("base1 not found"); + let base2 = dataset + .manifest + .base_paths + .values() + .find(|bp| bp.name == Some("base2".to_string())) + .expect("base2 not found"); + + // Verify is_dataset_root flags are persisted correctly in manifest + assert!( + base1.is_dataset_root, + "base1 should have is_dataset_root=true" + ); + assert!( + !base2.is_dataset_root, + "base2 should have is_dataset_root=false" + ); + + // Verify data was written to both bases + let fragments = dataset.get_fragments(); + assert!(!fragments.is_empty()); + + let has_base1_data = fragments + .iter() + .any(|f| f.metadata.files.iter().any(|file| file.base_id == Some(1))); + let has_base2_data = fragments + .iter() + .any(|f| f.metadata.files.iter().any(|file| file.base_id == Some(2))); + + assert!(has_base1_data, "Should have data in base1"); + assert!(has_base2_data, "Should have data in base2"); + + // Verify actual file paths on disk + // For base1 (is_dataset_root=true), files should be in base1/data/ + let base1_data_dir = base1_dir.join("data"); + assert!(base1_data_dir.exists(), "base1/data directory should exist"); + let base1_files: Vec<_> = std::fs::read_dir(&base1_data_dir) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| { + e.path() + .extension() + .map(|ext| ext == "lance") + .unwrap_or(false) + }) + .collect(); + assert!( + !base1_files.is_empty(), + "base1/data should contain .lance files" + ); + + // For base2 (is_dataset_root=false), files should be directly in base2/ + let base2_files: Vec<_> = std::fs::read_dir(&base2_dir) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| { + e.path() + .extension() + .map(|ext| ext == "lance") + .unwrap_or(false) + }) + .collect(); + assert!( + !base2_files.is_empty(), + "base2 should contain .lance files directly" + ); + + // Verify base2 does NOT have a data subdirectory with lance files + let base2_data_dir = base2_dir.join("data"); + if base2_data_dir.exists() { + let base2_data_files: Vec<_> = std::fs::read_dir(&base2_data_dir) + .unwrap() + .filter_map(|e| e.ok()) + .filter(|e| { + e.path() + .extension() + .map(|ext| ext == "lance") + .unwrap_or(false) + }) + .collect(); + assert!( + base2_data_files.is_empty(), + "base2/data should NOT contain .lance files" + ); + } + } } From 705c82b31927bf5de5f957299a0f8e35d4e6eab9 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 9 Oct 2025 18:16:42 -0700 Subject: [PATCH 04/11] clippy --- java/lance-jni/src/blocking_dataset.rs | 1 + java/lance-jni/src/transaction.rs | 2 + python/python/tests/test_multi_base.py | 66 ++++++++++++++++ python/src/dataset.rs | 8 +- python/src/transaction.rs | 4 +- rust/lance/src/dataset/write.rs | 100 ++++++++++++++++++++++++- 6 files changed, 174 insertions(+), 7 deletions(-) diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index e2c73524fd0..52d63bc2fab 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -564,6 +564,7 @@ pub fn inner_commit_overwrite<'local>( fragments, schema, config_upsert_values: None, + initial_bases: None, }; let path_str = path.extract(env)?; let read_version = env.get_u64_opt(&read_version_obj)?; diff --git a/java/lance-jni/src/transaction.rs b/java/lance-jni/src/transaction.rs index e73dbe40b44..f9ce719f369 100644 --- a/java/lance-jni/src/transaction.rs +++ b/java/lance-jni/src/transaction.rs @@ -472,6 +472,7 @@ fn convert_to_java_operation_inner<'local>( fragments: rust_fragments, schema, config_upsert_values, + initial_bases: _, } => { let java_fragments = export_vec(env, &rust_fragments)?; let java_schema = convert_to_java_schema(env, schema)?; @@ -890,6 +891,7 @@ fn convert_to_rust_operation( fragments, schema, config_upsert_values, + initial_bases: None, } } "Rewrite" => { diff --git a/python/python/tests/test_multi_base.py b/python/python/tests/test_multi_base.py index 0ea85bc53a1..07245fd4053 100644 --- a/python/python/tests/test_multi_base.py +++ b/python/python/tests/test_multi_base.py @@ -343,6 +343,72 @@ def test_multi_base_is_dataset_root_flag(self): result = dataset.to_table().to_pandas() assert len(result) == 100 + def test_multi_base_target_by_path_uri(self): + """Test using path URIs instead of names in target_bases.""" + # Create initial dataset with named bases + initial_data = self.create_test_data(100) + + dataset = lance.write_dataset( + initial_data, + self.primary_uri, + mode="create", + initial_bases=[ + DatasetBasePath(self.path1_uri, name="path1"), + DatasetBasePath(self.path2_uri, name="path2"), + ], + target_bases=["path1"], # Write to path1 using name + max_rows_per_file=50, + ) + + # Get the base_paths to find the actual path URI for path2 + base_paths = dataset._ds.base_paths() + path2_base = None + for base_path in base_paths.values(): + if base_path.name == "path2": + path2_base = base_path + break + + assert path2_base is not None, "path2 base not found" + + # Append using the path URI instead of name + append_data = self.create_test_data(50, id_offset=100) + + updated_dataset = lance.write_dataset( + append_data, + dataset, + mode="append", + target_bases=[path2_base.path], # Use path URI instead of name + max_rows_per_file=25, + ) + + # Verify appended data + result = updated_dataset.to_table().to_pandas() + assert len(result) == 150 + + # Verify data content + expected_ids = set(range(150)) + actual_ids = set(result["id"].tolist()) + assert actual_ids == expected_ids + + # Verify that new fragments are in path2 (not primary or path1) + fragments = list(updated_dataset.get_fragments()) + base_paths_updated = updated_dataset._ds.base_paths() + + path1_fragments = 0 + path2_fragments = 0 + + for fragment in fragments: + data_file = fragment.data_files()[0] + if data_file.base_id is not None: + base_path = base_paths_updated.get(data_file.base_id) + if base_path and base_path.name == "path1": + path1_fragments += 1 + elif base_path and base_path.name == "path2": + path2_fragments += 1 + + assert path1_fragments == 2, f"Expected 2 fragments in path1, got {path1_fragments}" + assert path2_fragments == 2, f"Expected 2 fragments in path2, got {path2_fragments}" + def test_validation_errors(self): """Test validation errors for invalid multi-base configurations.""" data = self.create_test_data(100) diff --git a/python/src/dataset.rs b/python/src/dataset.rs index e8b26ef96d2..7586d4ae387 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -427,7 +427,7 @@ impl From for DatasetBasePath { impl From for BasePath { fn from(py_base_path: DatasetBasePath) -> Self { - BasePath::new( + Self::new( py_base_path.id, py_base_path.path.clone(), py_base_path.name, @@ -2711,7 +2711,9 @@ pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult>>(options, "initial_bases")? { + if let Some(initial_bases_list) = + get_dict_opt::>>(options, "initial_bases")? + { let mut new_bases = Vec::new(); for item in initial_bases_list.iter() { @@ -2719,7 +2721,7 @@ pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult for PyLance { let initial_bases = ob .getattr("initial_bases") .ok() - .and_then(|attr| { - attr.extract::>>().ok() - }) + .and_then(|attr| attr.extract::>>().ok()) .flatten() .map(|py_bases| { py_bases diff --git a/rust/lance/src/dataset/write.rs b/rust/lance/src/dataset/write.rs index eefd3f6e835..e4250e1e106 100644 --- a/rust/lance/src/dataset/write.rs +++ b/rust/lance/src/dataset/write.rs @@ -1969,7 +1969,7 @@ mod tests { } #[tokio::test] - async fn test_is_dataset_root_flag() { + async fn test_multi_base_is_dataset_root_flag() { use lance_core::utils::tempfile::TempDir; use lance_testing::datagen::{BatchGenerator, IncrementingInt32}; @@ -2111,4 +2111,102 @@ mod tests { ); } } + + #[tokio::test] + async fn test_multi_base_target_by_path_uri() { + use lance_core::utils::tempfile::TempDir; + use lance_testing::datagen::{BatchGenerator, IncrementingInt32}; + + // Create dataset with named bases + let test_dir = TempDir::default(); + let primary_uri = test_dir.path_str(); + let base1_dir = test_dir.std_path().join("base1"); + let base2_dir = test_dir.std_path().join("base2"); + + std::fs::create_dir_all(&base1_dir).unwrap(); + std::fs::create_dir_all(&base2_dir).unwrap(); + + let base1_uri = format!("file://{}", base1_dir.display()); + let base2_uri = format!("file://{}", base2_dir.display()); + + let mut data_gen = + BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); + + // Create initial dataset writing to base1 using name + let dataset = Dataset::write( + data_gen.batch(10), + &primary_uri, + Some(WriteParams { + mode: WriteMode::Create, + max_rows_per_file: 5, + initial_bases: Some(vec![ + BasePath { + id: 1, + name: Some("base1".to_string()), + path: base1_uri.clone(), + is_dataset_root: true, + }, + BasePath { + id: 2, + name: Some("base2".to_string()), + path: base2_uri.clone(), + is_dataset_root: true, + }, + ]), + target_base_names_or_paths: Some(vec!["base1".to_string()]), // Use name + ..Default::default() + }), + ) + .await + .unwrap(); + + assert_eq!(dataset.count_rows(None).await.unwrap(), 10); + + // Verify data was written to base1 + let fragments = dataset.get_fragments(); + assert!(fragments.iter().all(|f| f + .metadata + .files + .iter() + .all(|file| file.base_id == Some(1)))); + + // Now append using the path URI instead of name + let mut data_gen2 = + BatchGenerator::new().col(Box::new(IncrementingInt32::new().named("id".to_owned()))); + + let dataset = Dataset::write( + data_gen2.batch(5), + Arc::new(dataset), + Some(WriteParams { + mode: WriteMode::Append, + // Use the actual path URI instead of the name + target_base_names_or_paths: Some(vec![base2_uri.clone()]), + max_rows_per_file: 5, + ..Default::default() + }), + ) + .await + .unwrap(); + + assert_eq!(dataset.count_rows(None).await.unwrap(), 15); + + // Verify data is now in both bases + let fragments = dataset.get_fragments(); + let has_base1_data = fragments + .iter() + .any(|f| f.metadata.files.iter().any(|file| file.base_id == Some(1))); + let has_base2_data = fragments + .iter() + .any(|f| f.metadata.files.iter().any(|file| file.base_id == Some(2))); + + assert!(has_base1_data, "Should have data in base1"); + assert!(has_base2_data, "Should have data in base2"); + + // Verify base2 has exactly 1 fragment (from the append) + let base2_fragments: Vec<_> = fragments + .iter() + .filter(|f| f.metadata.files.iter().all(|file| file.base_id == Some(2))) + .collect(); + assert_eq!(base2_fragments.len(), 1, "Should have 1 fragment in base2"); + } } From e0c7715797d4f730f8819796d256d3f15e1ba8cf Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 9 Oct 2025 19:50:45 -0700 Subject: [PATCH 05/11] python lint --- python/python/lance/dataset.py | 3 ++- python/python/tests/test_multi_base.py | 24 +++++++++++++++--------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index f9ef6d44433..6d15fae3133 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -4991,7 +4991,8 @@ def write_dataset( ... ] target_bases: list of str, optional - References to base paths where data should be written. Can be specified in all modes. + References to base paths where data should be written. Can be + specified in all modes. Each string is resolved by trying to match: 1. Base name (e.g., "primary", "archive") from registered bases diff --git a/python/python/tests/test_multi_base.py b/python/python/tests/test_multi_base.py index 07245fd4053..b92238f4003 100644 --- a/python/python/tests/test_multi_base.py +++ b/python/python/tests/test_multi_base.py @@ -8,12 +8,10 @@ import shutil import tempfile import uuid -from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path import lance import pandas as pd -import pyarrow as pa import pytest from lance import DatasetBasePath @@ -166,7 +164,7 @@ def test_multi_base_overwrite_mode_inherits_bases(self): assert any(bp.name == "path2" for bp in base_paths.values()) def test_multi_base_overwrite_mode_primary_path_default(self): - """Test that OVERWRITE mode defaults to primary path when no target specified.""" + """Test OVERWRITE mode defaults to primary path when no target.""" # Create initial dataset with explicit data file bases initial_data = self.create_test_data(100) @@ -277,7 +275,8 @@ def test_multi_base_append_mode_primary_path_default(self): assert any(bp.name == "path1" for bp in base_paths.values()) assert any(bp.name == "path2" for bp in base_paths.values()) - # Verify that new data files were written to primary path (not to path1 or path2) + # Verify new data files were written to primary path + # (not to path1 or path2) fragments = list(updated_dataset.get_fragments()) # The first 2 fragments should be in path1 (from initial write) # The remaining fragments should be in primary path (from append) @@ -293,9 +292,12 @@ def test_multi_base_append_mode_primary_path_default(self): if base_path and base_path.name == "path1": path1_fragments += 1 - assert path1_fragments == 2, f"Expected 2 fragments in path1, got {path1_fragments}" + assert path1_fragments == 2, ( + f"Expected 2 fragments in path1, got {path1_fragments}" + ) assert primary_path_fragments >= 3, ( - f"Expected at least 3 fragments in primary path, got {primary_path_fragments}" + f"Expected at least 3 fragments in primary path, " + f"got {primary_path_fragments}" ) def test_multi_base_is_dataset_root_flag(self): @@ -406,8 +408,12 @@ def test_multi_base_target_by_path_uri(self): elif base_path and base_path.name == "path2": path2_fragments += 1 - assert path1_fragments == 2, f"Expected 2 fragments in path1, got {path1_fragments}" - assert path2_fragments == 2, f"Expected 2 fragments in path2, got {path2_fragments}" + assert path1_fragments == 2, ( + f"Expected 2 fragments in path1, got {path1_fragments}" + ) + assert path2_fragments == 2, ( + f"Expected 2 fragments in path2, got {path2_fragments}" + ) def test_validation_errors(self): """Test validation errors for invalid multi-base configurations.""" @@ -479,4 +485,4 @@ def test_fragment_distribution(self): # Test scanning with filters filtered_result = dataset.scanner(filter="id < 500").to_table().to_pandas() assert len(filtered_result) == 500 - assert all(filtered_result["id"] < 500) \ No newline at end of file + assert all(filtered_result["id"] < 500) From 44ce9bfeed6f658705bd97f0ee6d542e19ba9259 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 9 Oct 2025 20:48:14 -0700 Subject: [PATCH 06/11] fix shallow clone bug --- rust/lance/src/dataset.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index bc27d53f416..6831bf503e5 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -1881,7 +1881,7 @@ impl Dataset { is_shallow: true, ref_name, ref_version: version_number, - ref_path: String::from(self.base.clone()), + ref_path: self.uri.clone(), branch_name: None, }; let transaction = Transaction::new(version_number, clone_op, None, None); From 550be7c3c9c760e3b28854251e31d90fb5dd133f Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 9 Oct 2025 22:45:39 -0700 Subject: [PATCH 07/11] fix windows shallow clone --- rust/lance/src/dataset/write/commit.rs | 1 + rust/lance/src/io/commit.rs | 29 ++++++++++++++------------ 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/rust/lance/src/dataset/write/commit.rs b/rust/lance/src/dataset/write/commit.rs index 142d43ec046..5a94332ce71 100644 --- a/rust/lance/src/dataset/write/commit.rs +++ b/rust/lance/src/dataset/write/commit.rs @@ -357,6 +357,7 @@ impl<'a> CommitBuilder<'a> { &manifest_config, manifest_naming_scheme, metadata_cache.as_ref(), + session.store_registry(), ) .await? }; diff --git a/rust/lance/src/io/commit.rs b/rust/lance/src/io/commit.rs index 922fcc9183e..1333c0ec566 100644 --- a/rust/lance/src/io/commit.rs +++ b/rust/lance/src/io/commit.rs @@ -40,14 +40,6 @@ use lance_table::io::commit::{ use rand::{rng, Rng}; use snafu::location; -use futures::future::Either; -use futures::{StreamExt, TryFutureExt, TryStreamExt}; -use lance_core::{Error, Result}; -use lance_index::{is_system_index, DatasetIndexExt}; -use log; -use object_store::path::Path; -use prost::Message; - use super::ObjectStore; use crate::dataset::cleanup::auto_cleanup_hook; use crate::dataset::fragment::FileFragment; @@ -61,6 +53,14 @@ use crate::session::caches::DSMetadataCache; use crate::session::index_caches::IndexMetadataKey; use crate::session::Session; use crate::Dataset; +use futures::future::Either; +use futures::{StreamExt, TryFutureExt, TryStreamExt}; +use lance_core::{Error, Result}; +use lance_index::{is_system_index, DatasetIndexExt}; +use lance_io::object_store::ObjectStoreRegistry; +use log; +use object_store::path::Path; +use prost::Message; mod conflict_resolver; #[cfg(all(feature = "dynamodb_tests", test))] @@ -109,6 +109,7 @@ async fn do_commit_new_dataset( manifest_naming_scheme: ManifestNamingScheme, blob_version: Option, metadata_cache: &DSMetadataCache, + store_registry: Arc, ) -> Result<(Manifest, ManifestLocation)> { let transaction_file = write_transaction_file(object_store, base_path, transaction).await?; @@ -120,12 +121,10 @@ async fn do_commit_new_dataset( .. } = &transaction.operation { + let source_base_path = + ObjectStore::extract_path_from_uri(store_registry, ref_path.as_str())?; let source_manifest_location = commit_handler - .resolve_version_location( - &Path::parse(ref_path.as_str())?, - *ref_version, - &object_store.inner, - ) + .resolve_version_location(&source_base_path, *ref_version, &object_store.inner) .await?; let source_manifest = Dataset::load_manifest( object_store, @@ -222,6 +221,7 @@ async fn do_commit_new_dataset( } } +#[allow(clippy::too_many_arguments)] pub(crate) async fn commit_new_dataset( object_store: &ObjectStore, commit_handler: &dyn CommitHandler, @@ -230,6 +230,7 @@ pub(crate) async fn commit_new_dataset( write_config: &ManifestWriteConfig, manifest_naming_scheme: ManifestNamingScheme, metadata_cache: &crate::session::caches::DSMetadataCache, + store_registry: Arc, ) -> Result<(Manifest, ManifestLocation)> { let blob_version = if let Some(blob_op) = transaction.blobs_op.as_ref() { let blob_path = base_path.child(BLOB_DIR); @@ -243,6 +244,7 @@ pub(crate) async fn commit_new_dataset( manifest_naming_scheme, None, metadata_cache, + store_registry.clone(), ) .await?; Some(blob_manifest.version) @@ -259,6 +261,7 @@ pub(crate) async fn commit_new_dataset( manifest_naming_scheme, blob_version, metadata_cache, + store_registry, ) .await } From 27cef48172726dee16eb3e772f1d7326d361f214 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 9 Oct 2025 23:07:40 -0700 Subject: [PATCH 08/11] box new param to minimize future size --- rust/lance/src/dataset.rs | 18 +++++++++++------- rust/lance/src/dataset/write/commit.rs | 2 +- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 6831bf503e5..b75cab2f8ae 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -167,7 +167,7 @@ pub struct Dataset { /// Object store parameters used when opening this dataset. /// These are used when creating object stores for additional base paths. - pub(crate) store_params: Option, + pub(crate) store_params: Option>, } impl std::fmt::Debug for Dataset { @@ -557,7 +557,7 @@ impl Dataset { self.session.clone(), self.commit_handler.clone(), self.file_reader_options.clone(), - self.store_params.clone(), + self.store_params.as_deref().cloned(), ) } @@ -695,7 +695,7 @@ impl Dataset { metadata_cache, index_cache, file_reader_options, - store_params, + store_params: store_params.map(Box::new), }) } @@ -838,7 +838,7 @@ impl Dataset { self.session.clone(), self.commit_handler.clone(), self.file_reader_options.clone(), - self.store_params.clone(), + self.store_params.as_deref().cloned(), )?; Ok(Some(Arc::new(blobs_dataset))) } else { @@ -1397,7 +1397,11 @@ impl Dataset { let (store, _) = ObjectStore::from_uri_and_params( self.session.store_registry(), &base_path.path, - &self.store_params.clone().unwrap_or_default(), + &self + .store_params + .as_deref() + .cloned() + .unwrap_or_default(), ) .await?; @@ -2024,7 +2028,7 @@ pub(crate) fn load_new_transactions(dataset: &Dataset) -> NewTransactionResult<' dataset.session(), dataset.commit_handler.clone(), dataset.file_reader_options.clone(), - dataset.store_params.clone(), + dataset.store_params.as_deref().cloned(), )?; let object_store = dataset_version.object_store(); let path = dataset_version @@ -2062,7 +2066,7 @@ pub(crate) fn load_new_transactions(dataset: &Dataset) -> NewTransactionResult<' dataset.session(), dataset.commit_handler.clone(), dataset.file_reader_options.clone(), - dataset.store_params.clone(), + dataset.store_params.as_deref().cloned(), ) } else { // If we didn't get the latest manifest, we can still return the dataset diff --git a/rust/lance/src/dataset/write/commit.rs b/rust/lance/src/dataset/write/commit.rs index 5a94332ce71..bb1ed481378 100644 --- a/rust/lance/src/dataset/write/commit.rs +++ b/rust/lance/src/dataset/write/commit.rs @@ -406,7 +406,7 @@ impl<'a> CommitBuilder<'a> { fragment_bitmap, metadata_cache, file_reader_options: None, - store_params: self.store_params.clone(), + store_params: self.store_params.clone().map(Box::new), }) } } From 0784526ea7bd6ecba811e3caa96e9dec615631d3 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 9 Oct 2025 23:08:00 -0700 Subject: [PATCH 09/11] fmt --- rust/lance/src/dataset.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index b75cab2f8ae..e14cc6c4be1 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -1397,11 +1397,7 @@ impl Dataset { let (store, _) = ObjectStore::from_uri_and_params( self.session.store_registry(), &base_path.path, - &self - .store_params - .as_deref() - .cloned() - .unwrap_or_default(), + &self.store_params.as_deref().cloned().unwrap_or_default(), ) .await?; From 31fefdeaad583d3ca4fe4fec513086c38d3519e3 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Thu, 9 Oct 2025 23:40:59 -0700 Subject: [PATCH 10/11] fix ci oom and remove doc examples --- .github/workflows/rust.yml | 2 +- python/python/lance/dataset.py | 34 ---------------------------------- 2 files changed, 1 insertion(+), 35 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index adf923f0c71..6a09b219716 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -156,7 +156,7 @@ jobs: ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` cargo test --locked --features ${ALL_FEATURES} build-no-lock: - runs-on: ubuntu-24.04 + runs-on: warp-ubuntu-2404-x64-4x timeout-minutes: 30 env: # Need up-to-date compilers for kernels diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 6d15fae3133..97fb3c5b5ee 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -4981,15 +4981,6 @@ def write_dataset( initial_bases: list of DatasetBasePath, optional New base paths to register in the manifest. Only used in **CREATE mode**. Cannot be specified in APPEND or OVERWRITE modes. - - Each :class:`DatasetBasePath` defines a storage location with an optional name: - - >>> from lance import DatasetBasePath - >>> bases = [ - ... DatasetBasePath("s3://bucket1/data", name="primary"), - ... DatasetBasePath("s3://bucket2/data", name="archive") - ... ] - target_bases: list of str, optional References to base paths where data should be written. Can be specified in all modes. @@ -5000,31 +4991,6 @@ def write_dataset( **CREATE mode**: References must match bases in `initial_bases` **APPEND/OVERWRITE modes**: References must match bases in the existing manifest - - Examples - -------- - Create a dataset with multiple base paths: - - >>> import lance - >>> from lance import DatasetBasePath - >>> initial_bases = [ - ... DatasetBasePath("s3://fast-storage/data", name="hot"), - ... DatasetBasePath("s3://archive-storage/data", name="cold") - ... ] - >>> ds = lance.write_dataset( - ... data, "s3://fast-storage/mydataset", - ... mode="create", - ... initial_bases=initial_bases, - ... target_bases=["hot"] - ... ) - - Append data to a specific base by name: - - >>> lance.write_dataset(new_data, ds, mode="append", target_bases=["hot"]) - - Overwrite data (inherits existing base_paths): - - >>> lance.write_dataset(new_data, ds, mode="overwrite", target_bases=["cold"]) """ if use_legacy_format is not None: warnings.warn( From e0f61b8e7c20194437c2c203ae764188a0dc20eb Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Fri, 10 Oct 2025 01:00:06 -0700 Subject: [PATCH 11/11] increase no build size --- .github/workflows/rust.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 6a09b219716..e45f9d6b53d 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -156,7 +156,7 @@ jobs: ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -` cargo test --locked --features ${ALL_FEATURES} build-no-lock: - runs-on: warp-ubuntu-2404-x64-4x + runs-on: warp-ubuntu-2404-x64-8x timeout-minutes: 30 env: # Need up-to-date compilers for kernels