Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4180,6 +4180,50 @@ def shallow_clone(
# Open and return a fresh dataset at the target URI to avoid manual overrides
return LanceDataset(target_uri, storage_options=storage_options, **kwargs)

def deep_clone(
    self,
    target_path: str | Path,
    reference: Optional[int | str | Tuple[Optional[str], Optional[int]]] = None,
    storage_options: Optional[Dict[str, str]] = None,
    **kwargs,
) -> "LanceDataset":
    """
    Deep clone the specified version into a new dataset at target_path.

    Unlike :meth:`shallow_clone`, this performs a server-side copy of all
    relevant data files (data files, deletion files, index files) into the
    target location, creating a fully independent dataset.

    Parameters
    ----------
    target_path : str or Path
        The URI or filesystem path to clone the dataset into.
    reference : int, str or Tuple[Optional[str], Optional[int]], optional
        An integer specifies a version number in the current branch; a string
        specifies a tag name; a Tuple[Optional[str], Optional[int]] specifies
        a version number in a specified branch. (None, None) means the latest
        version_number on the main branch. Defaults to ``None``, matching the
        ``_Dataset.deep_clone`` stub signature; presumably resolved the same
        way as ``(None, None)`` — confirm against the native implementation.
    storage_options : dict, optional
        Object store configuration for the new dataset (e.g., credentials,
        endpoints). If not specified, the storage options of the source dataset
        will be used.

    Returns
    -------
    LanceDataset
        A new LanceDataset representing the deep-cloned dataset.
    """
    # Normalize pathlib.Path inputs to a plain string URI/path.
    if isinstance(target_path, Path):
        target_uri = os.fspath(target_path)
    else:
        target_uri = target_path

    # Default to the source dataset's storage options so the clone targets
    # the same object store configuration unless explicitly overridden.
    if storage_options is None:
        storage_options = self._storage_options
    self._ds.deep_clone(target_uri, reference, storage_options)

    # Re-open the target fresh so the returned handle reflects exactly what
    # was written, rather than patching this dataset's state in place.
    return LanceDataset(target_uri, storage_options=storage_options, **kwargs)

def migrate_manifest_paths_v2(self):
"""
Migrate the manifest paths to the new format.
Expand Down
6 changes: 6 additions & 0 deletions python/python/lance/lance/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,12 @@ class _Dataset:
reference: Optional[int | str | Tuple[Optional[str], Optional[int]]] = None,
storage_options: Optional[Dict[str, str]] = None,
) -> _Dataset: ...
# Native binding: copies the referenced version's data, deletion, and index
# files into `target_path`, returning a handle to the new independent dataset.
# `reference` mirrors the Python-level API: int = version on the current
# branch, str = tag name, (branch, version) tuple = version on a branch;
# None presumably means the latest version — confirm against the Rust impl.
def deep_clone(
self,
target_path: str,
reference: Optional[int | str | Tuple[Optional[str], Optional[int]]] = None,
storage_options: Optional[Dict[str, str]] = None,
) -> _Dataset: ...
def restore(self): ...
def cleanup_old_versions(
self,
Expand Down
40 changes: 40 additions & 0 deletions python/python/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5294,6 +5294,46 @@ def test_branches(tmp_path: Path):
assert branch1.to_table().combine_chunks() == expected_branch1.combine_chunks()


def test_deep_clone(tmp_path: Path):
    """A deep clone is a fully independent, physically copied dataset."""
    # Build a source dataset with two committed versions.
    source_dir = tmp_path / "deep_src"
    first_table = pa.table({"a": [1, 2, 3], "b": [10, 20, 30]})
    lance.write_dataset(first_table, source_dir, mode="create")

    second_table = pa.table({"a": [4, 5, 6], "b": [40, 50, 60]})
    dataset = lance.write_dataset(second_table, source_dir, mode="overwrite")

    # Attach a tag to the first version so it can be cloned by name.
    dataset.tags.create("v1", 1)

    # Clone the latest version by its numeric id.
    v2_target = tmp_path / "deep_clone_v2"
    v2_clone = dataset.deep_clone(v2_target, 2)
    assert v2_clone.to_table() == second_table
    # Opening the target on its own proves the files were actually copied.
    assert lance.dataset(v2_target).to_table() == second_table

    # Clone the first version via its tag.
    v1_target = tmp_path / "deep_clone_v1"
    v1_clone = dataset.deep_clone(v1_target, "v1")
    assert v1_clone.to_table() == first_table
    assert lance.dataset(v1_target).to_table() == first_table

    # Clone a version that only exists on a branch.
    third_table = pa.table({"a": [7, 8, 9], "b": [70, 80, 90]})
    branch_ds = dataset.create_branch("branch", 2)
    lance.write_dataset(third_table, branch_ds.uri, mode="overwrite")
    branch_target = tmp_path / "deep_clone_branch"
    branch_clone = branch_ds.deep_clone(branch_target, 3)
    assert branch_clone.to_table() == third_table
    assert lance.dataset(branch_target).to_table() == third_table

    # A pre-existing target location must be rejected.
    with pytest.raises(OSError):
        dataset.deep_clone(v2_target, 2)


def test_default_scan_options_nearest(tmp_path: Path) -> None:
dim = 4
num_rows = 10
Expand Down
33 changes: 33 additions & 0 deletions python/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1567,6 +1567,39 @@ impl Dataset {
})
}

/// Deep clone the dataset into a new location, copying all data files.
///
/// Mirrors the Python-level `LanceDataset.deep_clone`: `reference` is an
/// arbitrary Python object (version int / tag str / branch tuple) decoded by
/// `transform_ref`, and `storage_options` configures the target object store.
#[pyo3(signature = (target_path, reference, storage_options=None))]
fn deep_clone(
    &mut self,
    py: Python,
    target_path: String,
    reference: Option<Bound<PyAny>>,
    storage_options: Option<HashMap<String, String>>,
) -> PyResult<Self> {
    // Decode the Python-side reference into the native representation first;
    // this surfaces invalid references before any work is started.
    let resolved_ref = self.transform_ref(reference)?;

    // Wrap any explicitly supplied storage options for the target store.
    let object_store_params = storage_options.as_ref().map(|opts| ObjectStoreParams {
        storage_options_accessor: Some(Arc::new(
            lance::io::StorageOptionsAccessor::with_static_options(opts.clone()),
        )),
        ..Default::default()
    });

    // Clone the inner dataset handle so the (mutable) clone operation does
    // not disturb the Python-visible state of `self`.
    let mut source = self.ds.as_ref().clone();
    let cloned = rt()
        .block_on(
            Some(py),
            source.deep_clone(&target_path, resolved_ref, object_store_params),
        )?
        .map_err(|err: Error| PyIOError::new_err(err.to_string()))?;

    let uri = cloned.uri().to_string();
    Ok(Self {
        ds: Arc::new(cloned),
        uri,
    })
}

fn restore(&mut self) -> PyResult<()> {
let mut new_self = self.ds.as_ref().clone();
rt().block_on(None, new_self.restore())?
Expand Down
Loading