From 5d4a7f77f7a226b7b8d4e495aaf287b0f1f5ba0f Mon Sep 17 00:00:00 2001 From: hushengquan <1390305506@qq.com> Date: Fri, 17 Apr 2026 17:42:32 +0800 Subject: [PATCH] feat(python): support deep_clone in python api --- python/python/lance/dataset.py | 44 ++++++++++++++++++++++++++ python/python/lance/lance/__init__.pyi | 6 ++++ python/python/tests/test_dataset.py | 40 +++++++++++++++++++++++ python/src/dataset.rs | 33 +++++++++++++++++++ 4 files changed, 123 insertions(+) diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 3c8668abb84..7367965d903 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -4180,6 +4180,50 @@ def shallow_clone( # Open and return a fresh dataset at the target URI to avoid manual overrides return LanceDataset(target_uri, storage_options=storage_options, **kwargs) + def deep_clone( + self, + target_path: str | Path, + reference: int | str | Tuple[Optional[str], Optional[int]], + storage_options: Optional[Dict[str, str]] = None, + **kwargs, + ) -> "LanceDataset": + """ + Deep clone the specified version into a new dataset at target_path. + + Unlike :meth:`shallow_clone`, this performs a server-side copy of all + relevant data files (data files, deletion files, index files) into the + target location, creating a fully independent dataset. + + Parameters + ---------- + target_path : str or Path + The URI or filesystem path to clone the dataset into. + reference : int, str or Tuple[Optional[str], Optional[int]] + An integer specifies a version number in the current branch; a string + specifies a tag name; a Tuple[Optional[str], Optional[int]] specifies + a version number in a specified branch. (None, None) means the latest + version_number on the main branch. + storage_options : dict, optional + Object store configuration for the new dataset (e.g., credentials, + endpoints). If not specified, the storage options of the source dataset + will be used. + + Returns + ------- + LanceDataset + A new LanceDataset representing the deep-cloned dataset. + """ + if isinstance(target_path, Path): + target_uri = os.fspath(target_path) + else: + target_uri = target_path + + if storage_options is None: + storage_options = self._storage_options + self._ds.deep_clone(target_uri, reference, storage_options) + + return LanceDataset(target_uri, storage_options=storage_options, **kwargs) + def migrate_manifest_paths_v2(self): """ Migrate the manifest paths to the new format. diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index 60398f9a3ae..89f43d55e58 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -316,6 +316,12 @@ class _Dataset: reference: Optional[int | str | Tuple[Optional[str], Optional[int]]] = None, storage_options: Optional[Dict[str, str]] = None, ) -> _Dataset: ... + def deep_clone( + self, + target_path: str, + reference: Optional[int | str | Tuple[Optional[str], Optional[int]]] = None, + storage_options: Optional[Dict[str, str]] = None, + ) -> _Dataset: ... def restore(self): ... def cleanup_old_versions( self, diff --git a/python/python/tests/test_dataset.py b/python/python/tests/test_dataset.py index 8d4f70d6efb..da43fd5a599 100644 --- a/python/python/tests/test_dataset.py +++ b/python/python/tests/test_dataset.py @@ -5294,6 +5294,46 @@ def test_branches(tmp_path: Path): assert branch1.to_table().combine_chunks() == expected_branch1.combine_chunks() +def test_deep_clone(tmp_path: Path): + """Deep clone copies all data files into a fully independent dataset.""" + # Prepare source dataset with two versions + src_dir = tmp_path / "deep_src" + table_v1 = pa.table({"a": [1, 2, 3], "b": [10, 20, 30]}) + lance.write_dataset(table_v1, src_dir, mode="create") + + table_v2 = pa.table({"a": [4, 5, 6], "b": [40, 50, 60]}) + ds = lance.write_dataset(table_v2, src_dir, mode="overwrite") + + # Tag version 1 for reference + ds.tags.create("v1", 1) + + # Deep clone by numeric version (v2) + clone_v2_dir = tmp_path / "deep_clone_v2" + ds_clone_v2 = ds.deep_clone(clone_v2_dir, 2) + assert ds_clone_v2.to_table() == table_v2 + # Re-open independently to verify files were actually copied + assert lance.dataset(clone_v2_dir).to_table() == table_v2 + + # Deep clone by tag (v1) + clone_v1_dir = tmp_path / "deep_clone_v1" + ds_clone_v1 = ds.deep_clone(clone_v1_dir, "v1") + assert ds_clone_v1.to_table() == table_v1 + assert lance.dataset(clone_v1_dir).to_table() == table_v1 + + # Deep clone from a branch + table_v3 = pa.table({"a": [7, 8, 9], "b": [70, 80, 90]}) + branch = ds.create_branch("branch", 2) + lance.write_dataset(table_v3, branch.uri, mode="overwrite") + clone_branch_dir = tmp_path / "deep_clone_branch" + cloned_branch = branch.deep_clone(clone_branch_dir, 3) + assert cloned_branch.to_table() == table_v3 + assert lance.dataset(clone_branch_dir).to_table() == table_v3 + + # Cloning to an existing target should error + with pytest.raises(OSError): + ds.deep_clone(clone_v2_dir, 2) + + def test_default_scan_options_nearest(tmp_path: Path) -> None: dim = 4 num_rows = 10 diff --git a/python/src/dataset.rs b/python/src/dataset.rs index 7e139446819..06360bf6078 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -1567,6 +1567,39 @@ impl Dataset { }) } + /// Deep clone the dataset into a new location, copying all data files + #[pyo3(signature = (target_path, reference, storage_options=None))] + fn deep_clone( + &mut self, + py: Python, + target_path: String, + reference: Option>, + storage_options: Option>, + ) -> PyResult { + let store_params = storage_options.as_ref().map(|opts| ObjectStoreParams { + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(opts.clone()), + )), + ..Default::default() + }); + + let mut new_self = self.ds.as_ref().clone(); + let reference = self.transform_ref(reference)?; + + let ds = rt() + .block_on( + Some(py), + new_self.deep_clone(&target_path, reference, store_params), + )? + .map_err(|err: Error| PyIOError::new_err(err.to_string()))?; + + let uri = ds.uri().to_string(); + Ok(Self { + ds: Arc::new(ds), + uri, + }) + } + fn restore(&mut self) -> PyResult<()> { let mut new_self = self.ds.as_ref().clone(); rt().block_on(None, new_self.restore())?