diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 30ac105d47f..358846f68b4 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -126,13 +126,11 @@ jobs: name: linux-wheels path: python/target/wheels/pylance-*.whl - forward-compat: + compat: needs: linux + timeout-minutes: 60 runs-on: ubuntu-24.04 - name: Forward Compatibility Tests (${{ matrix.lance-version }}) - strategy: - matrix: - lance-version: ["0.16.0", "0.30.0", "0.36.0"] + name: Compatibility Tests defaults: run: shell: bash @@ -154,16 +152,11 @@ jobs: - name: Install dependencies run: | pip install $(ls wheels/pylance-*.whl)[tests,ray] - - name: Generate forward compatibility files - env: - PYTHONPATH: python/tests - run: python -m forward_compat.datagen - - name: Run forward compatibility tests (pylance ${{ matrix.lance-version }}) + - name: Run compatibility tests run: | - python -m venv venv - source venv/bin/activate - pip install pytest pylance==${{ matrix.lance-version }} - pytest python/tests/forward_compat --run-forward + make compattest + env: + COMPAT_TEMP_VENV: 1 linux-arm: timeout-minutes: 45 diff --git a/python/Makefile b/python/Makefile index b224fa7461f..78c0489c522 100644 --- a/python/Makefile +++ b/python/Makefile @@ -16,6 +16,10 @@ doctest: pytest --doctest-modules $(PYTEST_ARGS) python/lance .PHONY: doctest +compattest: + pytest --run-compat $(PYTEST_ARGS) python/tests/compat +.PHONY: compattest + format: format-python cargo fmt .PHONY: format @@ -24,10 +28,6 @@ build: maturin develop .PHONY: build -clean: - rm -rf ./target -.PHONY: clean - format-python: ruff format python ruff check --fix python diff --git a/python/python/tests/forward_compat/__init__.py b/python/python/tests/compat/__init__.py similarity index 100% rename from python/python/tests/forward_compat/__init__.py rename to python/python/tests/compat/__init__.py diff --git a/python/python/tests/compat/compat_decorator.py b/python/python/tests/compat/compat_decorator.py 
new file mode 100644 index 00000000000..2f3c1cee044 --- /dev/null +++ b/python/python/tests/compat/compat_decorator.py @@ -0,0 +1,313 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +""" +Compatibility test infrastructure for Lance. + +This module provides the @compat_test() decorator and supporting infrastructure +for testing forward and backward compatibility across Lance versions. +""" + +import inspect +import json +import subprocess +import sys +import urllib.request +from functools import lru_cache +from typing import List + +import pytest +from packaging.version import Version + + +@lru_cache(maxsize=1) +def pylance_stable_versions() -> List[Version]: + """Fetches and returns a sorted list of stable pylance versions from PyPI.""" + try: + with urllib.request.urlopen( + "https://pypi.org/pypi/pylance/json", timeout=5 + ) as response: + data = json.loads(response.read()) + releases = data["releases"].keys() + stable_versions = [ + Version(v) + for v in releases + if not any(c in v for c in ["a", "b", "rc"]) + ] + stable_versions.sort() + return stable_versions + except Exception as e: + print( + f"Warning: Could not fetch pylance versions from PyPI: {e}", + file=sys.stderr, + ) + return [] + + +def recent_major_versions(n: int) -> List[str]: + """Returns the n most recent major versions of pylance as strings.""" + stable_versions = pylance_stable_versions() + major_versions = [] + seen_majors = set() + + def key(v: Version): + # On 0.x versions, we bumped minor version for breaking changes. + if v.major == 0: + return (0, v.minor) + return v.major + + for v in reversed(stable_versions): + if key(v) not in seen_majors: + seen_majors.add(key(v)) + major_versions.append(str(v)) + if len(major_versions) >= n: + break + return major_versions + + +@lru_cache(maxsize=1) +def last_beta_release(): + """Returns the latest beta version available on fury.io. 
+ + Uses pip to query the fury.io index for pre-release versions of pylance. + Results are cached to avoid repeated network calls. + """ + try: + # Use pip index to get versions from fury.io + result = subprocess.run( + [ + sys.executable, + "-m", + "pip", + "index", + "versions", + "pylance", + "--pre", + "--extra-index-url", + "https://pypi.fury.io/lancedb/", + ], + capture_output=True, + text=True, + timeout=10, + ) + + if result.returncode == 0: + # Parse output to find available versions + # Output format: "pylance (x.y.z)" + # Available versions: x.y.z.betaN, x.y.z, ... + for line in result.stdout.splitlines(): + if "Available versions:" in line: + versions_str = line.split("Available versions:")[1].strip() + versions = [v.strip() for v in versions_str.split(",")] + # Return the first beta/pre-release version + for v in versions: + if "beta" in v or "rc" in v or "a" in v or "b" in v: + return v + # If no pre-release found, return the first version + if versions: + return versions[0] + + print( + "Warning: Could not fetch latest beta release from fury.io", + file=sys.stderr, + ) + return None + + except Exception as e: + print( + f"Warning: Could not fetch latest beta release from fury.io: {e}", + file=sys.stderr, + ) + return None + + +VERSIONS = recent_major_versions(3) +LAST_BETA_RELEASE = last_beta_release() +if LAST_BETA_RELEASE is not None: + VERSIONS.append(LAST_BETA_RELEASE) + + +class UpgradeDowngradeTest: + """Base class for compatibility tests. + + Subclasses should implement: + - create(): Create test data/indices with current Lance version + - check_read(): Verify data can be read correctly + - check_write(): Verify data can be written/modified + """ + + def create(self): + pass + + def check_read(self): + pass + + def check_write(self): + pass + + +def compat_test(min_version: str = "0.16.0"): + """Decorator to generate upgrade/downgrade compatibility tests. 
+ + This decorator transforms a test class into two parameterized pytest test functions: + + 1. Downgrade test: Writes with current version, then reads with old version. + 2. Upgrade-Downgrade test: Writes with old version, reads with current version, + writes with current version, reads with old version. + + The test class should inherit from UpgradeDowngradeTest and implement: + - create(): Write data with the current Lance version + - check_read(): Verify data can be read + - check_write(): Verify data can be written + + The class can be parametrized with @pytest.mark.parametrize, and those + parameters will be applied to the generated test functions. + + Parameters + ---------- + min_version : str, optional + Minimum Lance version to test against; older versions are skipped. Defaults to "0.16.0". + + Example + ------- + @compat_test() + @pytest.mark.parametrize("file_version", ["1.0", "2.0"]) + class BasicTypes(UpgradeDowngradeTest): + def __init__(self, path: Path, file_version: str): + self.path = path + self.file_version = file_version + + def create(self): + # Write data + pass + + def check_read(self): + # Read and verify data + pass + + def check_write(self): + # Write data + pass + """ + version = set([min_version, *VERSIONS]) + versions = [v for v in version if Version(v) >= Version(min_version)] + + def decorator(cls): + # Extract existing parametrize marks from the class + existing_params = ( + [ + m + for m in ( + cls.pytestmark + if isinstance(cls.pytestmark, list) + else [cls.pytestmark] + ) + if getattr(m, "name", None) == "parametrize" + ] + if hasattr(cls, "pytestmark") + else [] + ) + + # Get parameter names from __init__ (excluding 'self' and 'path') + sig = inspect.signature(cls.__init__) + param_names = [p for p in sig.parameters.keys() if p not in ("self", "path")] + + # Create test functions dynamically with proper signatures + downgrade_func = _make_test_function(cls, param_names, "downgrade") + upgrade_downgrade_func = _make_test_function( + cls, param_names, 
"upgrade_downgrade" + ) + + # Apply version parametrization + downgrade_func = pytest.mark.parametrize("version", versions)(downgrade_func) + upgrade_downgrade_func = pytest.mark.parametrize("version", versions)( + upgrade_downgrade_func + ) + + # Apply existing parametrize marks + for mark in existing_params: + downgrade_func = pytest.mark.parametrize(*mark.args, **mark.kwargs)( + downgrade_func + ) + upgrade_downgrade_func = pytest.mark.parametrize(*mark.args, **mark.kwargs)( + upgrade_downgrade_func + ) + + # Apply compat marker + downgrade_func = pytest.mark.compat(downgrade_func) + upgrade_downgrade_func = pytest.mark.compat(upgrade_downgrade_func) + + # Set function names + downgrade_func.__name__ = f"test_{cls.__name__}_downgrade" + upgrade_downgrade_func.__name__ = f"test_{cls.__name__}_upgrade_downgrade" + + # Register test functions in the module where the class is defined + module = sys.modules[cls.__module__] + setattr(module, downgrade_func.__name__, downgrade_func) + setattr(module, upgrade_downgrade_func.__name__, upgrade_downgrade_func) + + return cls + + return decorator + + +def _make_test_function(cls, param_names, test_type): + """Create a test function with the correct signature for pytest. 
+ + Parameters + ---------- + cls : class + The test class to create a function for + param_names : list of str + Names of parameters from the class __init__ (excluding self and path) + test_type : str + Either "downgrade" or "upgrade_downgrade" + + Returns + ------- + function + Test function with correct signature for pytest + """ + # Build function signature + sig_params = "venv_factory, tmp_path, version" + for param in param_names: + sig_params += f", {param}" + + # Build parameter passing to __init__ + init_params = ", ".join(param_names) if param_names else "" + + # Build function body based on test type + if test_type == "downgrade": + func_body = f''' +def test_func({sig_params}): + """Test that old Lance version can read data written by current version.""" + from pathlib import Path + obj = cls(tmp_path / "data.lance", {init_params}) + # Current version: create data + obj.create() + # Old version: verify can read + venv = venv_factory.get_venv(version) + venv.execute_method(obj, "check_read") + venv.execute_method(obj, "check_write") +''' + else: # upgrade_downgrade + func_body = f''' +def test_func({sig_params}): + """Test round-trip compatibility: old -> current -> old.""" + from pathlib import Path + obj = cls(tmp_path / "data.lance", {init_params}) + venv = venv_factory.get_venv(version) + # Old version: create data + venv.execute_method(obj, "create") + # Current version: read and write + obj.check_read() + obj.check_write() + # Old version: verify can still read + venv.execute_method(obj, "check_read") + venv.execute_method(obj, "check_write") +''' + + # Execute to create the function + namespace = {"cls": cls} + exec(func_body, namespace) + return namespace["test_func"] diff --git a/python/python/tests/compat/conftest.py b/python/python/tests/compat/conftest.py new file mode 100644 index 00000000000..8a4d869b021 --- /dev/null +++ b/python/python/tests/compat/conftest.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright The Lance Authors + +import os +from pathlib import Path + +import pytest + +from .venv_manager import VenvFactory + + +@pytest.fixture(scope="session") +def venv_factory(tmp_path_factory): + """ + Create a VenvFactory for managing virtual environments during compatibility tests. + + This fixture is session-scoped so virtual environments are reused across tests, + improving test performance. + + By default, venvs are persistent (stored in ~/.cache/lance-compat-venvs/). + Set COMPAT_TEMP_VENV=1 to use temporary venvs that are cleaned up after + each session. + """ + use_temp = os.environ.get("COMPAT_TEMP_VENV", "").lower() in ( + "1", + "true", + "yes", + ) + + if use_temp: + # Use temporary venvs (old behavior) + base_path = tmp_path_factory.mktemp("venvs") + factory = VenvFactory(base_path, persistent=False) + yield factory + factory.cleanup_all() + else: + # Use persistent venvs + cache_dir = Path.home() / ".cache" / "lance-compat-venvs" + cache_dir.mkdir(parents=True, exist_ok=True) + factory = VenvFactory(cache_dir, persistent=True) + yield factory + # Don't cleanup persistent venvs diff --git a/python/python/tests/compat/test_file_formats.py b/python/python/tests/compat/test_file_formats.py new file mode 100644 index 00000000000..f65c8611ff6 --- /dev/null +++ b/python/python/tests/compat/test_file_formats.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +""" +File format compatibility tests for Lance. + +Tests that Lance files can be read and written across different versions, +covering various data types and file format versions. +""" + +from pathlib import Path + +import lance +import pytest +from lance.file import LanceFileReader, LanceFileWriter + +from .compat_decorator import ( + UpgradeDowngradeTest, + compat_test, +) +from .util import build_basic_types, build_large + + +# We start testing against the first release where 2.1 was stable. 
Before that +# the format was unstable so the readers will panic. +@compat_test(min_version="0.38.0") +class BasicTypes2_1(UpgradeDowngradeTest): + """Test file format 2.1 compatibility with basic data types.""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + batch = build_basic_types() + with LanceFileWriter( + str(self.path), version="2.1", schema=batch.schema + ) as writer: + writer.write_batch(batch) + + def check_read(self): + reader = LanceFileReader(str(self.path)) + table = reader.read_all().to_table() + assert table == build_basic_types() + + def check_write(self): + # Test with overwrite + with LanceFileWriter(str(self.path), version="2.1") as writer: + writer.write_batch(build_basic_types()) + + +@compat_test(min_version="0.16.0") +@pytest.mark.parametrize( + "data_factory,name", + [ + (build_basic_types, "basic_types"), + (build_large, "large"), + ], + ids=["basic_types", "large"], +) +class FileCompat(UpgradeDowngradeTest): + """Test file format compatibility with different data types. + + Tests both basic types (scalars, strings, etc.) and large data (vectors, binary). 
+ """ + + def __init__(self, path: Path, data_factory, name: str): + self.path = path + self.data_factory = data_factory + self.name = name + + def create(self): + """Create Lance file with test data.""" + batch = self.data_factory() + with LanceFileWriter( + str(self.path), version="2.0", schema=batch.schema + ) as writer: + writer.write_batch(batch) + + def check_read(self): + """Verify file can be read and data matches.""" + reader = LanceFileReader(str(self.path)) + table = reader.read_all().to_table() + expected = self.data_factory() + assert table.equals(expected), f"Data mismatch for {self.name}" + + def check_write(self): + """Verify can overwrite the file.""" + batch = self.data_factory() + with LanceFileWriter(str(self.path), version="2.0") as writer: + writer.write_batch(batch) + + +@compat_test(min_version="0.16.0") +class BasicTypesLegacy(UpgradeDowngradeTest): + """Test legacy data storage version 0.1 compatibility.""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + batch = build_basic_types() + lance.write_dataset(batch, self.path, data_storage_version="0.1") + + def check_read(self): + ds = lance.dataset(self.path) + table = ds.to_table() + assert table == build_basic_types() + + def check_write(self): + ds = lance.dataset(self.path) + ds.delete("true") + lance.write_dataset( + build_basic_types(), self.path, data_storage_version="0.1", mode="append" + ) diff --git a/python/python/tests/compat/test_scalar_indices.py b/python/python/tests/compat/test_scalar_indices.py new file mode 100644 index 00000000000..5d42a837bdc --- /dev/null +++ b/python/python/tests/compat/test_scalar_indices.py @@ -0,0 +1,315 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +""" +Scalar index compatibility tests for Lance. 
+ +Tests that scalar indices (BTREE, BITMAP, LABEL_LIST, NGRAM, ZONEMAP, +BLOOMFILTER, JSON, FTS) created with one version of Lance can be read +and written by other versions. +""" + +import shutil +from pathlib import Path + +import lance +import pyarrow as pa + +from .compat_decorator import ( + UpgradeDowngradeTest, + compat_test, +) + + +@compat_test(min_version="0.30.0") +class BTreeIndex(UpgradeDowngradeTest): + """Test BTREE scalar index compatibility (introduced in 0.20.0). + + Started fully working in 0.30.0 with various fixes. + """ + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with BTREE index.""" + shutil.rmtree(self.path, ignore_errors=True) + data = pa.table( + { + "idx": pa.array(range(1000)), + "btree": pa.array(range(1000)), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index("btree", "BTREE") + + def check_read(self): + """Verify BTREE index can be queried.""" + ds = lance.dataset(self.path) + table = ds.to_table(filter="btree == 7") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + # Verify index is used + explain = ds.scanner(filter="btree == 7").explain_plan() + assert "ScalarIndexQuery" in explain or "MaterializeIndex" in explain + + def check_write(self): + """Verify can insert data and optimize BTREE index.""" + ds = lance.dataset(self.path) + data = pa.table( + { + "idx": pa.array([1000]), + "btree": pa.array([1000]), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() + + # Verify new data is queryable + table = ds.to_table(filter="btree == 1000") + assert table.num_rows >= 1 + + +@compat_test(min_version="0.22.0") +class BitmapLabelListIndex(UpgradeDowngradeTest): + """Test BITMAP and LABEL_LIST scalar index compatibility (introduced in 0.20.0). + + Started fully working in 0.22.0 with fixes to LABEL_LIST index. 
+ """ + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with BITMAP and LABEL_LIST indices.""" + shutil.rmtree(self.path, ignore_errors=True) + data = pa.table( + { + "idx": pa.array(range(1000)), + "bitmap": pa.array(range(1000)), + "label_list": pa.array([[f"label{i}"] for i in range(1000)]), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index("bitmap", "BITMAP") + dataset.create_scalar_index("label_list", "LABEL_LIST") + + def check_read(self): + """Verify BITMAP and LABEL_LIST indices can be queried.""" + ds = lance.dataset(self.path) + + # Test BITMAP index + table = ds.to_table(filter="bitmap == 7") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + # Test LABEL_LIST index + table = ds.to_table(filter="array_has_any(label_list, ['label7'])") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + def check_write(self): + """Verify can insert data and optimize indices.""" + ds = lance.dataset(self.path) + data = pa.table( + { + "idx": pa.array([1000]), + "bitmap": pa.array([1000]), + "label_list": pa.array([["label1000"]]), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() + + +@compat_test(min_version="0.36.0") +class NgramIndex(UpgradeDowngradeTest): + """Test NGRAM index compatibility (introduced in 0.36.0).""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with NGRAM index.""" + shutil.rmtree(self.path, ignore_errors=True) + data = pa.table( + { + "idx": pa.array(range(1000)), + "ngram": pa.array([f"word{i}" for i in range(1000)]), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index("ngram", "NGRAM") + + def check_read(self): + """Verify NGRAM index can be queried.""" + ds = lance.dataset(self.path) + table = ds.to_table(filter="contains(ngram, 'word7')") 
+ # word7, word70-79, word700-799 = 111 results + assert table.num_rows == 111 + + # Verify index is used + explain = ds.scanner(filter="contains(ngram, 'word7')").explain_plan() + assert "ScalarIndexQuery" in explain + + def check_write(self): + """Verify can insert data and optimize NGRAM index.""" + ds = lance.dataset(self.path) + data = pa.table( + { + "idx": pa.array([1000]), + "ngram": pa.array(["word1000"]), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() + + +@compat_test(min_version="0.36.0") +class ZonemapBloomfilterIndex(UpgradeDowngradeTest): + """Test ZONEMAP and BLOOMFILTER index compatibility (introduced in 0.36.0).""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with ZONEMAP and BLOOMFILTER indices.""" + shutil.rmtree(self.path, ignore_errors=True) + data = pa.table( + { + "idx": pa.array(range(1000)), + "zonemap": pa.array(range(1000)), + "bloomfilter": pa.array(range(1000)), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index("zonemap", "ZONEMAP") + dataset.create_scalar_index("bloomfilter", "BLOOMFILTER") + + def check_read(self): + """Verify ZONEMAP and BLOOMFILTER indices can be queried.""" + ds = lance.dataset(self.path) + + # Test ZONEMAP + table = ds.to_table(filter="zonemap == 7") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + # Test BLOOMFILTER + table = ds.to_table(filter="bloomfilter == 7") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + def check_write(self): + """Verify can insert data and optimize indices.""" + ds = lance.dataset(self.path) + data = pa.table( + { + "idx": pa.array([1000]), + "zonemap": pa.array([1000]), + "bloomfilter": pa.array([1000]), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() + + +@compat_test(min_version="0.36.0") +class JsonIndex(UpgradeDowngradeTest): 
+ """Test JSON index compatibility (introduced in 0.36.0).""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with JSON index.""" + from lance.indices import IndexConfig + + shutil.rmtree(self.path, ignore_errors=True) + data = pa.table( + { + "idx": pa.array(range(1000)), + "json": pa.array([f'{{"val": {i}}}' for i in range(1000)], pa.json_()), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index( + "json", + IndexConfig( + index_type="json", + parameters={"target_index_type": "btree", "path": "val"}, + ), + ) + + def check_read(self): + """Verify JSON index can be queried.""" + ds = lance.dataset(self.path) + table = ds.to_table(filter="json_get_int(json, 'val') == 7") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + # Verify index is used + explain = ds.scanner(filter="json_get_int(json, 'val') == 7").explain_plan() + assert "ScalarIndexQuery" in explain + + def check_write(self): + """Verify can insert data with JSON index.""" + ds = lance.dataset(self.path) + data = pa.table( + { + "idx": pa.array([1000]), + "json": pa.array(['{"val": 1000}'], pa.json_()), + } + ) + ds.insert(data) + ds.optimize.compact_files() + + +@compat_test(min_version="0.36.0") +class FtsIndex(UpgradeDowngradeTest): + """Test FTS (full-text search) index compatibility (introduced in 0.36.0).""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with FTS index.""" + shutil.rmtree(self.path, ignore_errors=True) + data = pa.table( + { + "idx": pa.array(range(1000)), + "text": pa.array( + [f"document with words {i} and more text" for i in range(1000)] + ), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index("text", "INVERTED") + + def check_read(self): + """Verify FTS index can be queried.""" + ds = lance.dataset(self.path) + # Search for documents 
containing "words" and "7" + # Note: Actual FTS query syntax may vary + table = ds.to_table(filter="text LIKE '%words 7 %'") + assert table.num_rows > 0 + + def check_write(self): + """Verify can insert data with FTS index.""" + # Dataset::load_manifest does not do retain_supported_indices + # so this can only work with no cache + session = lance.Session(index_cache_size_bytes=0, metadata_cache_size_bytes=0) + ds = lance.dataset(self.path, session=session) + data = pa.table( + { + "idx": pa.array([1000]), + "text": pa.array(["new document to index"]), + } + ) + ds.insert(data) + ds.optimize.compact_files() diff --git a/python/python/tests/compat/test_vector_indices.py b/python/python/tests/compat/test_vector_indices.py new file mode 100644 index 00000000000..b58ded4f5ff --- /dev/null +++ b/python/python/tests/compat/test_vector_indices.py @@ -0,0 +1,212 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +""" +Vector index compatibility tests for Lance. + +Tests that vector indices (IVF_PQ, etc.) created with one version of Lance +can be read and written by other versions. 
+""" + +import shutil +from pathlib import Path + +import lance +import pyarrow as pa +import pyarrow.compute as pc + +from .compat_decorator import ( + UpgradeDowngradeTest, + compat_test, +) + + +@compat_test(min_version="0.29.1.beta2") +class PqVectorIndex(UpgradeDowngradeTest): + """Test PQ (Product Quantization) vector index compatibility.""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with PQ vector index.""" + shutil.rmtree(self.path, ignore_errors=True) + ndims = 32 + nvecs = 512 + + data = pa.table( + { + "id": pa.array(range(nvecs)), + "vec": pa.FixedSizeListArray.from_arrays( + pc.random(ndims * nvecs).cast(pa.float32()), ndims + ), + } + ) + + dataset = lance.write_dataset(data, self.path) + dataset.create_index( + "vec", + "IVF_PQ", + num_partitions=1, + num_sub_vectors=4, + ) + + def check_read(self): + """Verify PQ index can be queried.""" + ds = lance.dataset(self.path) + # Query with random vector + q = pc.random(32).cast(pa.float32()) + result = ds.to_table( + nearest={ + "q": q, + "k": 4, + "column": "vec", + } + ) + assert result.num_rows == 4 + + def check_write(self): + """Verify can insert vectors and rebuild index.""" + ds = lance.dataset(self.path) + # Add new vectors + data = pa.table( + { + "id": pa.array([1000]), + "vec": pa.FixedSizeListArray.from_arrays( + pc.random(32).cast(pa.float32()), 32 + ), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() + + +@compat_test(min_version="0.39.0") +class HnswPqVectorIndex(UpgradeDowngradeTest): + """Test IVF_HNSW_PQ vector index compatibility. + + Note: Only tests versions >= 0.39.0 because earlier versions don't support + remapping for IVF_HNSW_PQ indices, which is required for optimize operations. 
+ """ + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with IVF_HNSW_PQ vector index.""" + shutil.rmtree(self.path, ignore_errors=True) + ndims = 32 + nvecs = 512 + + data = pa.table( + { + "id": pa.array(range(nvecs)), + "vec": pa.FixedSizeListArray.from_arrays( + pc.random(ndims * nvecs).cast(pa.float32()), ndims + ), + } + ) + + dataset = lance.write_dataset(data, self.path) + dataset.create_index( + "vec", + "IVF_HNSW_PQ", + num_partitions=4, + num_sub_vectors=4, + ) + + def check_read(self): + """Verify IVF_HNSW_PQ index can be queried.""" + ds = lance.dataset(self.path) + # Query with random vector + q = pc.random(32).cast(pa.float32()) + result = ds.to_table( + nearest={ + "q": q, + "k": 4, + "column": "vec", + } + ) + assert result.num_rows == 4 + + def check_write(self): + """Verify can insert vectors and rebuild index.""" + ds = lance.dataset(self.path) + # Add new vectors + data = pa.table( + { + "id": pa.array([1000]), + "vec": pa.FixedSizeListArray.from_arrays( + pc.random(32).cast(pa.float32()), 32 + ), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() + + +@compat_test(min_version="0.39.0") +class HnswSqVectorIndex(UpgradeDowngradeTest): + """Test IVF_HNSW_SQ vector index compatibility. + + Note: Only tests versions >= 0.39.0 because earlier versions don't support + remapping for IVF_HNSW_SQ indices, which is required for optimize operations. 
+ """ + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with IVF_HNSW_SQ vector index.""" + shutil.rmtree(self.path, ignore_errors=True) + ndims = 32 + nvecs = 512 + + data = pa.table( + { + "id": pa.array(range(nvecs)), + "vec": pa.FixedSizeListArray.from_arrays( + pc.random(ndims * nvecs).cast(pa.float32()), ndims + ), + } + ) + + dataset = lance.write_dataset(data, self.path) + dataset.create_index( + "vec", + "IVF_HNSW_SQ", + num_partitions=4, + num_sub_vectors=4, + ) + + def check_read(self): + """Verify IVF_HNSW_SQ index can be queried.""" + ds = lance.dataset(self.path) + # Query with random vector + q = pc.random(32).cast(pa.float32()) + result = ds.to_table( + nearest={ + "q": q, + "k": 4, + "column": "vec", + } + ) + assert result.num_rows == 4 + + def check_write(self): + """Verify can insert vectors and rebuild index.""" + ds = lance.dataset(self.path) + # Add new vectors + data = pa.table( + { + "id": pa.array([1000]), + "vec": pa.FixedSizeListArray.from_arrays( + pc.random(32).cast(pa.float32()), 32 + ), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() diff --git a/python/python/tests/forward_compat/util.py b/python/python/tests/compat/util.py similarity index 90% rename from python/python/tests/forward_compat/util.py rename to python/python/tests/compat/util.py index 319d38d1178..210bc581579 100644 --- a/python/python/tests/forward_compat/util.py +++ b/python/python/tests/compat/util.py @@ -5,21 +5,10 @@ # # Everything here must be runnable by older versions of Lance. 
-from pathlib import Path import pyarrow as pa -def get_path(name: str): - dataset_dir = ( - Path(__file__).parent.parent.parent.parent.parent - / "test_data" - / "forward_compat" - / name - ) - return dataset_dir - - def build_basic_types(): schema = pa.schema( [ diff --git a/python/python/tests/compat/venv_manager.py b/python/python/tests/compat/venv_manager.py new file mode 100644 index 00000000000..3f803439860 --- /dev/null +++ b/python/python/tests/compat/venv_manager.py @@ -0,0 +1,283 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +""" +Virtual environment management for compatibility testing. + +Manages creation and execution of test code in isolated virtual environments +with specific Lance versions installed. +""" + +import os +import pickle +import struct +import subprocess +import sys +from pathlib import Path +from typing import Any, Optional + + +class VenvExecutor: + """Manages a virtual environment with a specific Lance version.""" + + def __init__(self, version: str, venv_path: Path, persistent: bool = False): + """ + Initialize a VenvExecutor. 
+ + Parameters + ---------- + version : str + Lance version to install (e.g., "0.30.0") + venv_path : Path + Directory where virtual environment will be created + persistent : bool + If True, venv is persistent and validated before use + """ + self.version = version + self.venv_path = Path(venv_path) + self.persistent = persistent + self._created = False + self._subprocess: Optional[subprocess.Popen] = None + + @property + def python_path(self) -> Path: + if sys.platform == "win32": + return self.venv_path / "Scripts" / "python.exe" + return self.venv_path / "bin" / "python" + + def _validate_venv(self) -> bool: + """Check if existing venv is valid and has correct Lance version.""" + if not self.venv_path.exists(): + return False + + if not self.python_path.exists(): + return False + + # Check if pylance is installed with correct version + try: + result = subprocess.run( + [str(self.python_path), "-m", "pip", "show", "pylance"], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode != 0: + return False + + # Parse version from output + for line in result.stdout.splitlines(): + if line.startswith("Version:"): + installed_version = line.split(":", 1)[1].strip() + return installed_version == self.version + + except Exception: + return False + + return False + + def create(self): + """Create the virtual environment and install the specified Lance version.""" + if self._created: + return + + # Check if persistent venv already exists and is valid + if self.persistent and self._validate_venv(): + self._created = True + return + + # Create virtual environment + subprocess.run( + [sys.executable, "-m", "venv", str(self.venv_path)], + check=True, + capture_output=True, + ) + + # Install specific pylance version and pytest + subprocess.run( + [ + str(self.python_path), + "-m", + "pip", + "install", + "--quiet", + "--pre", + "--extra-index-url", + "https://pypi.fury.io/lancedb/", + f"pylance=={self.version}", + "pytest", + ], + check=True, + 
capture_output=True, + ) + + self._created = True + + def _ensure_subprocess(self): + """Ensure the persistent subprocess is running.""" + if self._subprocess is not None and self._subprocess.poll() is None: + # Subprocess is already running + return + + # Start persistent subprocess + runner_script = Path(__file__).parent / "venv_runner.py" + + # Set PYTHONPATH to include the tests directory + env = os.environ.copy() + tests_dir = Path(__file__).parent.parent + env["PYTHONPATH"] = str(tests_dir) + + self._subprocess = subprocess.Popen( + [str(self.python_path), "-u", str(runner_script)], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=None, # Inherit stderr to see timing messages + env=env, + ) + + def _send_message(self, obj: Any): + """Send a length-prefixed pickled message to subprocess.""" + data = pickle.dumps(obj) + length = struct.pack(">I", len(data)) + self._subprocess.stdin.write(length) + self._subprocess.stdin.write(data) + self._subprocess.stdin.flush() + + def _receive_message(self) -> Any: + """Receive a length-prefixed pickled message from subprocess.""" + # Read 4-byte length header + length_bytes = self._subprocess.stdout.read(4) + if len(length_bytes) < 4: + raise RuntimeError("Failed to read message length from subprocess") + + length = struct.unpack(">I", length_bytes)[0] + + # Read message data + data = self._subprocess.stdout.read(length) + if len(data) < length: + raise RuntimeError( + f"Incomplete message: expected {length} bytes, got {len(data)}" + ) + + return pickle.loads(data) + + def execute_method(self, obj: Any, method_name: str) -> Any: + """ + Execute a method on a pickled object in the virtual environment. + + Uses a persistent subprocess to avoid repeatedly importing Lance and + its dependencies. + + Parameters + ---------- + obj : Any + Object to pickle and send to venv. Must be picklable. 
+ method_name : str + Name of the method to call on the object + + Returns + ------- + Any + Return value from the method call + + Raises + ------ + Exception + Re-raises any exception that occurred in the venv + """ + if not self._created: + raise RuntimeError("Virtual environment not created. Call create() first.") + + # Ensure subprocess is running + self._ensure_subprocess() + try: + # Send request: (obj, method_name) + self._send_message((obj, method_name)) + + # Receive response + response = self._receive_message() + + if response["success"]: + return response["result"] + else: + # Error occurred in subprocess + error_msg = ( + f"Error in venv (Lance {self.version}) calling {method_name}:\n" + f"{response['exception_type']}: {response['exception_msg']}\n" + f"\nTraceback from venv:\n{response['traceback']}" + ) + raise RuntimeError(error_msg) + + except (BrokenPipeError, EOFError, struct.error) as e: + # Subprocess died or communication failed + raise RuntimeError( + f"Communication with venv subprocess failed (Lance {self.version}):\n" + f"Error: {e}" + ) + + def cleanup(self): + """Remove the virtual environment directory and terminate subprocess.""" + # Terminate the persistent subprocess + if self._subprocess is not None: + try: + self._subprocess.stdin.close() + self._subprocess.terminate() + self._subprocess.wait(timeout=5) + except Exception: + # Force kill if graceful termination fails + self._subprocess.kill() + finally: + self._subprocess = None + + # Remove venv directory + if self.venv_path.exists(): + import shutil + + shutil.rmtree(self.venv_path) + self._created = False + + +class VenvFactory: + """Factory for creating and managing VenvExecutor instances.""" + + def __init__(self, base_path: Path, persistent: bool = False): + """ + Initialize the factory. 
+ + Parameters + ---------- + base_path : Path + Base directory for creating virtual environments + persistent : bool + If True, venvs are not cleaned up and can be reused across sessions + """ + self.base_path = Path(base_path) + self.persistent = persistent + self.venvs: dict[str, VenvExecutor] = {} + + def get_venv(self, version: str) -> VenvExecutor: + """ + Get or create a VenvExecutor for the specified version. + + Parameters + ---------- + version : str + Lance version + + Returns + ------- + VenvExecutor + Executor for the specified version + """ + if version not in self.venvs: + venv_path = self.base_path / f"venv_{version}" + executor = VenvExecutor(version, venv_path, persistent=self.persistent) + executor.create() + self.venvs[version] = executor + return self.venvs[version] + + def cleanup_all(self): + """Clean up all created virtual environments (skips persistent venvs).""" + if not self.persistent: + for venv in self.venvs.values(): + venv.cleanup() + self.venvs.clear() diff --git a/python/python/tests/compat/venv_runner.py b/python/python/tests/compat/venv_runner.py new file mode 100644 index 00000000000..ce7b3c3de77 --- /dev/null +++ b/python/python/tests/compat/venv_runner.py @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +""" +Runner script executed inside virtual environments to run compatibility tests. + +This script runs as a persistent subprocess that accepts multiple method calls +without restarting. This avoids the overhead of repeatedly importing Lance and +its dependencies. 
+ +Protocol: +- Reads 4 bytes (message length as big-endian int) +- Reads that many bytes (pickled tuple of (obj, method_name)) +- Executes method on object +- Writes 4 bytes (response length) +- Writes pickled response dict +""" + +import os +import pickle +import struct +import sys +import time +import traceback + +# Enable detailed timing output with DEBUG=1 +DEBUG = os.environ.get("DEBUG", "").lower() in ("1", "true", "yes") + + +def read_message(stream): + """Read a length-prefixed pickled message from stream.""" + # Read 4-byte length header + length_bytes = stream.buffer.read(4) + if len(length_bytes) < 4: + return None # EOF + + length = struct.unpack(">I", length_bytes)[0] + + # Read message data + data = stream.buffer.read(length) + if len(data) < length: + raise RuntimeError( + f"Incomplete message: expected {length} bytes, got {len(data)}" + ) + + return pickle.loads(data) + + +def write_message(stream, obj): + """Write a length-prefixed pickled message to stream.""" + data = pickle.dumps(obj) + length = struct.pack(">I", len(data)) + stream.buffer.write(length) + stream.buffer.write(data) + stream.buffer.flush() + + +def main(): + """Main loop that processes method calls until EOF.""" + while True: + try: + # Read request (obj, method_name) + request = read_message(sys.stdin) + if request is None: + # EOF - parent closed connection + break + + obj, method_name = request + + # Execute method with timing + start_time = time.time() + if DEBUG: + print( + f"[VENV TIMING] Executing {method_name}...", + file=sys.stderr, + flush=True, + ) + + method = getattr(obj, method_name) + result = method() + + if DEBUG: + exec_time = time.time() - start_time + print( + f"[VENV TIMING] {method_name} completed in {exec_time:.2f}s", + file=sys.stderr, + flush=True, + ) + + # Send success response + response = {"success": True, "result": result} + write_message(sys.stdout, response) + + except Exception as e: + # Send error response + error_info = { + "success": False, + 
"exception_type": type(e).__name__, + "exception_msg": str(e), + "traceback": traceback.format_exc(), + } + write_message(sys.stdout, error_info) + + +if __name__ == "__main__": + main() diff --git a/python/python/tests/conftest.py b/python/python/tests/conftest.py index 3c344d207f5..49a6eeaa490 100644 --- a/python/python/tests/conftest.py +++ b/python/python/tests/conftest.py @@ -42,6 +42,12 @@ def pytest_addoption(parser): default=False, help="Run forward compatibility tests (requires files to be generated already)", ) + parser.addoption( + "--run-compat", + action="store_true", + default=False, + help="Run upgrade/downgrade compatibility tests (creates virtual environments)", + ) def pytest_configure(config): @@ -55,6 +61,10 @@ def pytest_configure(config): config.addinivalue_line( "markers", "slow: mark tests that require large CPU or RAM resources" ) + config.addinivalue_line( + "markers", + "compat: mark tests that run upgrade/downgrade compatibility checks", + ) def pytest_collection_modifyitems(config, items): @@ -64,6 +74,8 @@ def pytest_collection_modifyitems(config, items): disable_items_with_mark(items, "slow", "--run-slow not specified") if not config.getoption("--run-forward"): disable_items_with_mark(items, "forward", "--run-forward not specified") + if not config.getoption("--run-compat"): + disable_items_with_mark(items, "compat", "--run-compat not specified") try: import torch diff --git a/python/python/tests/forward_compat/datagen.py b/python/python/tests/forward_compat/datagen.py deleted file mode 100644 index c5ef40609bd..00000000000 --- a/python/python/tests/forward_compat/datagen.py +++ /dev/null @@ -1,154 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright The Lance Authors - -# Data generation for forward compatibility tests -# -# This file will be run on the up-to-date version of Lance to generate -# test data that will be read by older versions of Lance in test_compat.py - -import shutil - -import lance 
-import pyarrow as pa -import pyarrow.compute as pc -from lance.file import LanceFileWriter -from lance.indices.builder import IndexConfig - -from forward_compat.util import build_basic_types, build_large, get_path - - -def write_basic_types(): - path = get_path("basic_types.lance") - with LanceFileWriter(str(path)) as writer: - writer.write_batch(build_basic_types()) - - -def write_large(): - path = get_path("large.lance") - with LanceFileWriter(str(path)) as writer: - writer.write_batch(build_large()) - - -def write_dataset_pq_buffer(): - # In https://github.com/lancedb/lance/pull/3829, we started storing the PQ - # codebook in a global buffer instead of the schema metadata as JSON. - - shutil.rmtree(get_path("pq_in_schema"), ignore_errors=True) - - ndims = 32 - nvecs = 512 - - data = pa.table( - { - "id": pa.array(range(nvecs)), - "vec": pa.FixedSizeListArray.from_arrays( - pc.random(ndims * nvecs).cast(pa.float32()), ndims - ), - } - ) - - dataset = lance.write_dataset(data, get_path("pq_in_schema")) - dataset.create_index( - "vec", - "IVF_PQ", - num_partitions=1, - num_sub_vectors=4, - ) - - -def write_dataset_json(): - shutil.rmtree(get_path("json"), ignore_errors=True) - - data = pa.table( - { - "idx": pa.array(range(1000)), - "json": pa.array([f'{{"val": {i}}}' for i in range(1000)], pa.json_()), - } - ) - - dataset = lance.write_dataset(data, get_path("json"), max_rows_per_file=100) - dataset.create_scalar_index( - "json", - IndexConfig( - index_type="json", parameters={"target_index_type": "btree", "path": "val"} - ), - ) - - -def write_dataset_btree_index(): - shutil.rmtree(get_path("btree_index"), ignore_errors=True) - - data = pa.table( - { - "idx": pa.array(range(1000)), - "btree": pa.array(range(1000)), - } - ) - - dataset = lance.write_dataset(data, get_path("btree_index"), max_rows_per_file=100) - dataset.create_scalar_index("btree", "BTREE") - - -def write_dataset_bitmap_labellist_index(): - shutil.rmtree(get_path("bitmap_labellist_index"), 
ignore_errors=True) - - data = pa.table( - { - "idx": pa.array(range(1000)), - "bitmap": pa.array(range(1000)), - "label_list": pa.array([[f"label{i}"] for i in range(1000)]), - } - ) - - dataset = lance.write_dataset( - data, get_path("bitmap_labellist_index"), max_rows_per_file=100 - ) - dataset.create_scalar_index("bitmap", "BITMAP") - dataset.create_scalar_index("label_list", "LABEL_LIST") - - -def write_dataset_ngram_zonemap_bloomfilter_index(): - shutil.rmtree(get_path("ngram_zonemap_bloomfilter_index"), ignore_errors=True) - - data = pa.table( - { - "idx": pa.array(range(1000)), - "ngram": pa.array([f"word{i}" for i in range(1000)]), - "zonemap": pa.array(range(1000)), - "bloomfilter": pa.array(range(1000)), - } - ) - - dataset = lance.write_dataset( - data, get_path("ngram_zonemap_bloomfilter_index"), max_rows_per_file=100 - ) - dataset.create_scalar_index("ngram", "NGRAM") - dataset.create_scalar_index("zonemap", "ZONEMAP") - dataset.create_scalar_index("bloomfilter", "BLOOMFILTER") - - -def write_dataset_fts_index(): - shutil.rmtree(get_path("fts_index"), ignore_errors=True) - - data = pa.table( - { - "idx": pa.array(range(1000)), - "text": pa.array( - [f"document with words {i} and more text" for i in range(1000)] - ), - } - ) - - dataset = lance.write_dataset(data, get_path("fts_index"), max_rows_per_file=100) - dataset.create_scalar_index("text", "INVERTED") - - -if __name__ == "__main__": - write_basic_types() - write_large() - write_dataset_pq_buffer() - write_dataset_btree_index() - write_dataset_bitmap_labellist_index() - write_dataset_ngram_zonemap_bloomfilter_index() - write_dataset_json() - write_dataset_fts_index() diff --git a/python/python/tests/forward_compat/test_compat.py b/python/python/tests/forward_compat/test_compat.py deleted file mode 100644 index 5a1e2a1adde..00000000000 --- a/python/python/tests/forward_compat/test_compat.py +++ /dev/null @@ -1,243 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright 
The Lance Authors - -# Forward compatibility tests for older versions of Lance -# -# This file will be run on older versions of Lance to test that the -# current version of Lance can read the test data generated by datagen.py. - -import shutil - -import lance -import pyarrow as pa -import pyarrow.compute as pc -import pytest -from lance.file import LanceFileReader -from packaging.version import Version - -from .util import build_basic_types, build_large, get_path - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.36.0"), # at least 0.36.0 - reason="version is too old to support JSON index", -) -def test_json_index(): - ds = lance.dataset(get_path("json")) - tbl = ds.to_table(filter="json_get_int(json, 'val') == 7") - assert tbl.num_rows == 1 - assert tbl.column("idx").to_pylist() == [7] - - explain = ds.scanner(filter="json_get_int(json, 'val') == 7").explain_plan() - assert "ScalarIndexQuery" in explain - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.36.0"), # at least 0.36.0 - reason="version is too old to support NGRAM index", -) -def test_ngram_index(): - ds = lance.dataset(get_path("ngram_zonemap_bloomfilter_index")) - tbl = ds.to_table(filter="contains(ngram, 'word7')") - assert tbl.num_rows == 111 - - explain = ds.scanner(filter="contains(ngram, 'word7')").explain_plan() - assert "ScalarIndexQuery" in explain - - -def query_seven(ds, filt: str): - table = ds.to_table(filter=filt) - assert table.num_rows == 1 - assert table.column("idx").to_pylist() == [7] - - explain = ds.scanner(filter=filt).explain_plan() - assert "ScalarIndexQuery" in explain or "MaterializeIndex" in explain - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.20.0"), - reason="Version is too old to read index files stored with Lance 2.0 file format", -) -def test_index_search(): - ds = lance.dataset(get_path("btree_index")) - query_seven(ds, "btree == 7") - - ds = 
lance.dataset(get_path("bitmap_labellist_index")) - - query_seven(ds, "bitmap == 7") - query_seven(ds, "array_has_any(label_list, ['label7'])") - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.36.0"), - reason="ZONEMAP and BLOOMFILTER indices were introduced in 0.36.0", -) -def test_zonemap_bloomfilter_index_search(): - ds = lance.dataset(get_path("ngram_zonemap_bloomfilter_index")) - query_seven(ds, "zonemap == 7") - query_seven(ds, "bloomfilter == 7") - - -@pytest.mark.forward -def test_scans(): - expected_basic_types = build_basic_types() - actual_basic_types = ( - LanceFileReader(str(get_path("basic_types.lance"))).read_all().to_table() - ) - assert actual_basic_types.equals(expected_basic_types) - - expected_large = build_large() - actual_large = LanceFileReader(str(get_path("large.lance"))).read_all().to_table() - assert actual_large.equals(expected_large) - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.29.1.beta2"), # at least 0.29.1-beta.2 - reason="Lance 0.29.1-beta.2 would ignore indices too new", -) -def test_pq_buffer(): - ds = lance.dataset(get_path("pq_in_schema")) - # the index should be ignored, still able to query (brute force) - q = pc.random(32).cast(pa.float32()) - ds.to_table( - nearest={ - "q": q, - "k": 4, - "column": "vec", - } - ) - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.36.0"), - reason="FTS token set format was introduced in 0.36.0", -) -def test_list_indices_ignores_new_fts_index_version(): - # Dataset::load_manifest does not do retain_supported_indices - # so this can only work with no cache - session = lance.Session(index_cache_size_bytes=0, metadata_cache_size_bytes=0) - ds = lance.dataset(get_path("fts_index"), session=session) - indices = ds.list_indices() - # the new index version should be ignored - assert len(indices) == 0 - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < 
Version("0.35.0"), - reason="0.35.0 changes BTREE index schema", -) -def test_write_btree_index(tmp_path: str): - path = get_path("btree_index") - # copy to tmp path to avoid modifying original - shutil.copytree(path, tmp_path, dirs_exist_ok=True) - - ds = lance.dataset(tmp_path) - data = pa.table( - { - "idx": pa.array([1000]), - "btree": pa.array([1000]), - } - ) - ds.insert(data) - ds.optimize.optimize_indices() - ds.optimize.compact_files() - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.20.0"), - reason="Version is too old to read index files stored with Lance 2.0 file format", -) -def test_write_bitmap_labellist_index(tmp_path: str): - path = get_path("bitmap_labellist_index") - # copy to tmp path to avoid modifying original - shutil.copytree(path, tmp_path, dirs_exist_ok=True) - - ds = lance.dataset(tmp_path) - data = pa.table( - { - "idx": pa.array([1000]), - "bitmap": pa.array([1000]), - "label_list": pa.array([["label1000"]]), - } - ) - ds.insert(data) - ds.optimize.optimize_indices() - ds.optimize.compact_files() - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.36.0"), - reason="NGRAM index was introduced in 0.36.0", -) -def test_write_ngram_index(tmp_path: str): - path = get_path("ngram_zonemap_bloomfilter_index") - # copy to tmp path to avoid modifying original - shutil.copytree(path, tmp_path, dirs_exist_ok=True) - - ds = lance.dataset(tmp_path) - data = pa.table( - { - "idx": pa.array([1000]), - "ngram": pa.array(["word1000"]), - } - ) - ds.insert(data) - ds.optimize.optimize_indices() - ds.optimize.compact_files() - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.36.0"), - reason="ZONEMAP and BLOOMFILTER index was introduced in 0.36.0", -) -def test_write_zonemap_bloomfilter_index(tmp_path: str): - path = get_path("ngram_zonemap_bloomfilter_index") - # copy to tmp path to avoid modifying original - shutil.copytree(path, 
tmp_path, dirs_exist_ok=True) - - ds = lance.dataset(tmp_path) - data = pa.table( - { - "idx": pa.array([1000]), - "zonemap": pa.array([1000]), - "bloomfilter": pa.array([1000]), - } - ) - ds.insert(data) - ds.optimize.optimize_indices() - ds.optimize.compact_files() - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.36.0"), - reason="FTS token set format was introduced in 0.36.0", -) -def test_write_fts(tmp_path: str): - path = get_path("fts_index") - # copy to tmp path to avoid modifying original - shutil.copytree(path, tmp_path, dirs_exist_ok=True) - - # Dataset::load_manifest does not do retain_supported_indices - # so this can only work with no cache - session = lance.Session(index_cache_size_bytes=0, metadata_cache_size_bytes=0) - ds = lance.dataset(tmp_path, session=session) - data = pa.table( - { - "idx": pa.array([1000]), - "text": pa.array(["new document to index"]), - } - ) - ds.insert(data) - # ds.optimize.optimize_indices() - ds.optimize.compact_files()