From e2bb263c9c0ea7842132efed7966874946d48bb9 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 6 Nov 2025 14:04:52 -0800 Subject: [PATCH 01/19] start of test framework --- python/python/tests/conftest.py | 12 + .../python/tests/forward_compat/conftest.py | 21 ++ .../tests/forward_compat/index_tests.py | 240 ++++++++++++++++++ .../tests/forward_compat/venv_manager.py | 198 +++++++++++++++ .../tests/forward_compat/venv_runner.py | 46 ++++ 5 files changed, 517 insertions(+) create mode 100644 python/python/tests/forward_compat/conftest.py create mode 100644 python/python/tests/forward_compat/index_tests.py create mode 100644 python/python/tests/forward_compat/venv_manager.py create mode 100644 python/python/tests/forward_compat/venv_runner.py diff --git a/python/python/tests/conftest.py b/python/python/tests/conftest.py index 3c344d207f5..49a6eeaa490 100644 --- a/python/python/tests/conftest.py +++ b/python/python/tests/conftest.py @@ -42,6 +42,12 @@ def pytest_addoption(parser): default=False, help="Run forward compatibility tests (requires files to be generated already)", ) + parser.addoption( + "--run-compat", + action="store_true", + default=False, + help="Run upgrade/downgrade compatibility tests (creates virtual environments)", + ) def pytest_configure(config): @@ -55,6 +61,10 @@ def pytest_configure(config): config.addinivalue_line( "markers", "slow: mark tests that require large CPU or RAM resources" ) + config.addinivalue_line( + "markers", + "compat: mark tests that run upgrade/downgrade compatibility checks", + ) def pytest_collection_modifyitems(config, items): @@ -64,6 +74,8 @@ def pytest_collection_modifyitems(config, items): disable_items_with_mark(items, "slow", "--run-slow not specified") if not config.getoption("--run-forward"): disable_items_with_mark(items, "forward", "--run-forward not specified") + if not config.getoption("--run-compat"): + disable_items_with_mark(items, "compat", "--run-compat not specified") try: import torch diff --git 
a/python/python/tests/forward_compat/conftest.py b/python/python/tests/forward_compat/conftest.py new file mode 100644 index 00000000000..60b0db598fd --- /dev/null +++ b/python/python/tests/forward_compat/conftest.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +import pytest + +from .venv_manager import VenvFactory + + +@pytest.fixture(scope="session") +def venv_factory(tmp_path_factory): + """ + Create a VenvFactory for managing virtual environments during compatibility tests. + + This fixture is session-scoped so virtual environments are reused across tests, + improving test performance. + """ + base_path = tmp_path_factory.mktemp("venvs") + factory = VenvFactory(base_path) + yield factory + # Cleanup all venvs at end of session + factory.cleanup_all() diff --git a/python/python/tests/forward_compat/index_tests.py b/python/python/tests/forward_compat/index_tests.py new file mode 100644 index 00000000000..f77f953a17f --- /dev/null +++ b/python/python/tests/forward_compat/index_tests.py @@ -0,0 +1,240 @@ +import inspect +import sys +from pathlib import Path + +import pytest +from lance.file import LanceFileReader, LanceFileWriter + +from .util import build_basic_types + +# Flow: +# 1. Old +# a. gen_data +# b. create_index +# c. test_query +# d. test_stats +# 2. Current +# a. test_query +# b. test_stats +# c. test_optimize +# 3. Old +# a. test_query +# b. test_stats +# c. test_optimize + + +class UpgradeDowngradeTest: + def create(self): + pass + + def check_read(self): + pass + + def check_write(self): + pass + + +VERSIONS = ["0.16.0", "0.30.0", "0.36.0"] + + +def compat_test(versions=None): + """Decorator to generate upgrade/downgrade compatibility tests. + + This decorator transforms a test class into two parameterized pytest test functions: + + 1. Downgrade test: Writes with current version, then reads with old version. + 2. 
Upgrade-Downgrade test: Writes with old version, reads with current version, + writes with current version, reads with old version. + + The test class should inherit from UpgradeDowngradeTest and implement: + - create(): Write data with the current Lance version + - check_read(): Verify data can be read + - check_write(): Verify data can be written + + The class can be parametrized with @pytest.mark.parametrize, and those + parameters will be applied to the generated test functions. + + Parameters + ---------- + versions : list of str, optional + List of Lance versions to test against. Defaults to VERSIONS. + + Example + ------- + @compat_test() + @pytest.mark.parametrize("file_version", ["1.0", "2.0"]) + class BasicTypes(UpgradeDowngradeTest): + def __init__(self, path: Path, file_version: str): + self.path = path + self.file_version = file_version + + def create(self): + # Write data + pass + + def check_read(self): + # Read and verify data + pass + + def check_write(self): + # Write data + pass + """ + if versions is None: + versions = VERSIONS + + def decorator(cls): + # Extract existing parametrize marks from the class + existing_params = ( + [ + m + for m in ( + cls.pytestmark + if isinstance(cls.pytestmark, list) + else [cls.pytestmark] + ) + if getattr(m, "name", None) == "parametrize" + ] + if hasattr(cls, "pytestmark") + else [] + ) + + # Get parameter names from __init__ (excluding 'self' and 'path') + sig = inspect.signature(cls.__init__) + param_names = [p for p in sig.parameters.keys() if p not in ("self", "path")] + + # Create test functions dynamically with proper signatures + downgrade_func = _make_test_function(cls, param_names, "downgrade") + upgrade_downgrade_func = _make_test_function( + cls, param_names, "upgrade_downgrade" + ) + + # Apply version parametrization + downgrade_func = pytest.mark.parametrize("version", versions)(downgrade_func) + upgrade_downgrade_func = pytest.mark.parametrize("version", versions)( + upgrade_downgrade_func + ) + 
+ # Apply existing parametrize marks + for mark in existing_params: + downgrade_func = pytest.mark.parametrize(*mark.args, **mark.kwargs)( + downgrade_func + ) + upgrade_downgrade_func = pytest.mark.parametrize(*mark.args, **mark.kwargs)( + upgrade_downgrade_func + ) + + # Apply compat marker + downgrade_func = pytest.mark.compat(downgrade_func) + upgrade_downgrade_func = pytest.mark.compat(upgrade_downgrade_func) + + # Set function names + downgrade_func.__name__ = f"test_{cls.__name__}_downgrade" + upgrade_downgrade_func.__name__ = f"test_{cls.__name__}_upgrade_downgrade" + + # Register test functions in the module where the class is defined + module = sys.modules[cls.__module__] + setattr(module, downgrade_func.__name__, downgrade_func) + setattr(module, upgrade_downgrade_func.__name__, upgrade_downgrade_func) + + return cls + + return decorator + + +def _make_test_function(cls, param_names, test_type): + """Create a test function with the correct signature for pytest. + + Parameters + ---------- + cls : class + The test class to create a function for + param_names : list of str + Names of parameters from the class __init__ (excluding self and path) + test_type : str + Either "downgrade" or "upgrade_downgrade" + + Returns + ------- + function + Test function with correct signature for pytest + """ + # Build function signature + sig_params = "venv_factory, tmp_path, version" + for param in param_names: + sig_params += f", {param}" + + # Build parameter passing to __init__ + init_params = ", ".join(param_names) if param_names else "" + + # Build function body based on test type + if test_type == "downgrade": + func_body = f''' +def test_func({sig_params}): + """Test that old Lance version can read data written by current version.""" + from pathlib import Path + obj = cls(tmp_path / "data.lance", {init_params}) + # Current version: create data + obj.create() + # Old version: verify can read + venv = venv_factory.get_venv(version) + venv.execute_method(obj, 
"check_read") +''' + else: # upgrade_downgrade + func_body = f''' +def test_func({sig_params}): + """Test round-trip compatibility: old -> current -> old.""" + from pathlib import Path + obj = cls(tmp_path / "data.lance", {init_params}) + venv = venv_factory.get_venv(version) + # Old version: create data + venv.execute_method(obj, "create") + # Current version: read and write + obj.check_read() + obj.check_write() + # Old version: verify can still read + venv.execute_method(obj, "check_read") +''' + + # Execute to create the function + namespace = {"cls": cls} + exec(func_body, namespace) + return namespace["test_func"] + + +@compat_test() +@pytest.mark.parametrize("file_version", ["2.0"]) # Only test stable file versions +class BasicTypes(UpgradeDowngradeTest): + def __init__(self, path: Path, file_version: str): + self.path = path + self.file_version = file_version + + def create(self): + with LanceFileWriter(str(self.path), version=self.file_version) as writer: + writer.write_batch(build_basic_types()) + + def check_read(self): + reader = LanceFileReader(str(self.path)) + table = reader.read_all().to_table() + assert table == build_basic_types() + + def check_write(self): + with LanceFileWriter(str(self.path), version=self.file_version) as writer: + writer.write_batch(build_basic_types()) + + +class IndexTest: + def gen_data(self): + pass + + def create_index(self): + pass + + def test_query(self): + pass + + def test_stats(self): + pass + + def test_optimize(self): + pass diff --git a/python/python/tests/forward_compat/venv_manager.py b/python/python/tests/forward_compat/venv_manager.py new file mode 100644 index 00000000000..2fea6eba91b --- /dev/null +++ b/python/python/tests/forward_compat/venv_manager.py @@ -0,0 +1,198 @@ +""" +Virtual environment management for compatibility testing. + +Manages creation and execution of test code in isolated virtual environments +with specific Lance versions installed. 
+""" + +import pickle +import subprocess +import sys +from pathlib import Path +from typing import Any, Optional + + +class VenvExecutor: + """Manages a virtual environment with a specific Lance version.""" + + def __init__(self, version: str, venv_path: Path): + """ + Initialize a VenvExecutor. + + Parameters + ---------- + version : str + Lance version to install (e.g., "0.30.0") + venv_path : Path + Directory where virtual environment will be created + """ + self.version = version + self.venv_path = Path(venv_path) + self.python_path: Optional[Path] = None + self._created = False + + def create(self): + """Create the virtual environment and install the specified Lance version.""" + if self._created: + return + + # Create virtual environment + subprocess.run( + [sys.executable, "-m", "venv", str(self.venv_path)], + check=True, + capture_output=True, + ) + + # Determine python path in venv + if sys.platform == "win32": + self.python_path = self.venv_path / "Scripts" / "python.exe" + else: + self.python_path = self.venv_path / "bin" / "python" + + # Upgrade pip + subprocess.run( + [str(self.python_path), "-m", "pip", "install", "--upgrade", "pip"], + check=True, + capture_output=True, + ) + + # Install specific pylance version and pytest (needed for test modules) + subprocess.run( + [ + str(self.python_path), + "-m", + "pip", + "install", + f"pylance=={self.version}", + "pytest", + ], + check=True, + capture_output=True, + ) + + self._created = True + + def execute_method(self, obj: Any, method_name: str) -> Any: + """ + Execute a method on a pickled object in the virtual environment. + + Parameters + ---------- + obj : Any + Object to pickle and send to venv. Must be picklable. 
+ method_name : str + Name of the method to call on the object + + Returns + ------- + Any + Return value from the method call + + Raises + ------ + Exception + Re-raises any exception that occurred in the venv + """ + if not self._created: + raise RuntimeError("Virtual environment not created. Call create() first.") + + # Get path to venv_runner.py + runner_script = Path(__file__).parent / "venv_runner.py" + + # Pickle the object + pickled_obj = pickle.dumps(obj) + + # Set PYTHONPATH to include the tests directory so the venv can import + # test modules. This allows unpickling test classes (they're pickled as + # forward_compat.*) + import os + + env = os.environ.copy() + tests_dir = Path(__file__).parent.parent + env["PYTHONPATH"] = str(tests_dir) + + # Run the venv_runner.py script + result = subprocess.run( + [str(self.python_path), str(runner_script), method_name], + input=pickled_obj, + capture_output=True, + env=env, + ) + + # Parse the result + if result.returncode == 0: + response = pickle.loads(result.stdout) + if response["success"]: + return response["result"] + else: + # This shouldn't happen if returncode is 0, but handle it + raise RuntimeError(f"Unexpected error: {response}") + else: + # Execution failed, unpickle error info + try: + error_info = pickle.loads(result.stdout) + # Re-create the exception with traceback info + error_msg = ( + f"Error in venv (Lance {self.version}) calling {method_name}:\n" + f"{error_info['exception_type']}: {error_info['exception_msg']}\n" + f"\nTraceback from venv:\n{error_info['traceback']}" + ) + raise RuntimeError(error_msg) + except (pickle.UnpicklingError, KeyError, EOFError): + # If we can't unpickle the error, show raw output + raise RuntimeError( + f"Failed to execute {method_name} in venv (Lance {self.version}):\n" + f"stdout: {result.stdout.decode('utf-8', errors='replace')}\n" + f"stderr: {result.stderr.decode('utf-8', errors='replace')}" + ) + + def cleanup(self): + """Remove the virtual environment 
directory.""" + if self.venv_path.exists(): + import shutil + + shutil.rmtree(self.venv_path) + self._created = False + + +class VenvFactory: + """Factory for creating and managing VenvExecutor instances.""" + + def __init__(self, base_path: Path): + """ + Initialize the factory. + + Parameters + ---------- + base_path : Path + Base directory for creating virtual environments + """ + self.base_path = Path(base_path) + self.venvs: dict[str, VenvExecutor] = {} + + def get_venv(self, version: str) -> VenvExecutor: + """ + Get or create a VenvExecutor for the specified version. + + Parameters + ---------- + version : str + Lance version + + Returns + ------- + VenvExecutor + Executor for the specified version + """ + if version not in self.venvs: + venv_path = self.base_path / f"venv_{version}" + executor = VenvExecutor(version, venv_path) + executor.create() + self.venvs[version] = executor + return self.venvs[version] + + def cleanup_all(self): + """Clean up all created virtual environments.""" + for venv in self.venvs.values(): + venv.cleanup() + self.venvs.clear() diff --git a/python/python/tests/forward_compat/venv_runner.py b/python/python/tests/forward_compat/venv_runner.py new file mode 100644 index 00000000000..a84add5c7b9 --- /dev/null +++ b/python/python/tests/forward_compat/venv_runner.py @@ -0,0 +1,46 @@ +""" +Runner script executed inside virtual environments to run compatibility tests. + +This script is executed in a subprocess with a specific Lance version installed. +It receives a pickled test object and method name, executes the method, and +returns the result. 
+""" + +import pickle +import sys +import traceback + + +def main(): + if len(sys.argv) < 2: + print("Usage: venv_runner.py ", file=sys.stderr) + sys.exit(1) + + method_name = sys.argv[1] + + try: + # Read pickled object from stdin + obj = pickle.load(sys.stdin.buffer) + + # Call the specified method + method = getattr(obj, method_name) + result = method() + + # Write success indicator and optional result + pickle.dump({"success": True, "result": result}, sys.stdout.buffer) + sys.exit(0) + + except Exception as e: + # Capture exception details to send back + error_info = { + "success": False, + "exception_type": type(e).__name__, + "exception_msg": str(e), + "traceback": traceback.format_exc(), + } + pickle.dump(error_info, sys.stdout.buffer) + sys.exit(1) + + +if __name__ == "__main__": + main() From b01a1f5876726e4540e81e5b9f4dcefff5e01513 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 6 Nov 2025 14:19:46 -0800 Subject: [PATCH 02/19] add 0.1 tests --- .../tests/forward_compat/index_tests.py | 29 +++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/python/python/tests/forward_compat/index_tests.py b/python/python/tests/forward_compat/index_tests.py index f77f953a17f..b735a124c39 100644 --- a/python/python/tests/forward_compat/index_tests.py +++ b/python/python/tests/forward_compat/index_tests.py @@ -2,6 +2,7 @@ import sys from pathlib import Path +import lance import pytest from lance.file import LanceFileReader, LanceFileWriter @@ -210,8 +211,11 @@ def __init__(self, path: Path, file_version: str): self.file_version = file_version def create(self): - with LanceFileWriter(str(self.path), version=self.file_version) as writer: - writer.write_batch(build_basic_types()) + batch = build_basic_types() + with LanceFileWriter( + str(self.path), version=self.file_version, schema=batch.schema + ) as writer: + writer.write_batch(batch) def check_read(self): reader = LanceFileReader(str(self.path)) @@ -223,6 +227,27 @@ def check_write(self): 
writer.write_batch(build_basic_types()) +@compat_test() +class BasicTypesLegacy(UpgradeDowngradeTest): + def __init__(self, path: Path, file_version: str): + self.path = path + self.file_version = file_version + + def create(self): + batch = build_basic_types() + lance.write_dataset(batch, self.path, data_storage_version="0.1") + + def check_read(self): + ds = lance.dataset(self.path) + table = ds.to_table() + assert table == build_basic_types() + + def check_write(self): + ds = lance.dataset(self.path) + ds.delete("true") + ds.insert(build_basic_types()) + + class IndexTest: def gen_data(self): pass From e2e702fde2e3fbd7dd571ba98e889106a92d056d Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 6 Nov 2025 14:31:13 -0800 Subject: [PATCH 03/19] address performance --- .../tests/forward_compat/index_tests.py | 3 +- .../tests/forward_compat/venv_manager.py | 130 ++++++++++++------ .../tests/forward_compat/venv_runner.py | 98 ++++++++----- 3 files changed, 159 insertions(+), 72 deletions(-) diff --git a/python/python/tests/forward_compat/index_tests.py b/python/python/tests/forward_compat/index_tests.py index b735a124c39..ffec149d44b 100644 --- a/python/python/tests/forward_compat/index_tests.py +++ b/python/python/tests/forward_compat/index_tests.py @@ -229,9 +229,8 @@ def check_write(self): @compat_test() class BasicTypesLegacy(UpgradeDowngradeTest): - def __init__(self, path: Path, file_version: str): + def __init__(self, path: Path): self.path = path - self.file_version = file_version def create(self): batch = build_basic_types() diff --git a/python/python/tests/forward_compat/venv_manager.py b/python/python/tests/forward_compat/venv_manager.py index 2fea6eba91b..202deee0289 100644 --- a/python/python/tests/forward_compat/venv_manager.py +++ b/python/python/tests/forward_compat/venv_manager.py @@ -5,7 +5,9 @@ with specific Lance versions installed. 
""" +import os import pickle +import struct import subprocess import sys from pathlib import Path @@ -30,6 +32,7 @@ def __init__(self, version: str, venv_path: Path): self.venv_path = Path(venv_path) self.python_path: Optional[Path] = None self._created = False + self._subprocess: Optional[subprocess.Popen] = None def create(self): """Create the virtual environment and install the specified Lance version.""" @@ -72,10 +75,61 @@ def create(self): self._created = True + def _ensure_subprocess(self): + """Ensure the persistent subprocess is running.""" + if self._subprocess is not None and self._subprocess.poll() is None: + # Subprocess is already running + return + + # Start persistent subprocess + runner_script = Path(__file__).parent / "venv_runner.py" + + # Set PYTHONPATH to include the tests directory + env = os.environ.copy() + tests_dir = Path(__file__).parent.parent + env["PYTHONPATH"] = str(tests_dir) + + self._subprocess = subprocess.Popen( + [str(self.python_path), "-u", str(runner_script)], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env, + ) + + def _send_message(self, obj: Any): + """Send a length-prefixed pickled message to subprocess.""" + data = pickle.dumps(obj) + length = struct.pack(">I", len(data)) + self._subprocess.stdin.write(length) + self._subprocess.stdin.write(data) + self._subprocess.stdin.flush() + + def _receive_message(self) -> Any: + """Receive a length-prefixed pickled message from subprocess.""" + # Read 4-byte length header + length_bytes = self._subprocess.stdout.read(4) + if len(length_bytes) < 4: + raise RuntimeError("Failed to read message length from subprocess") + + length = struct.unpack(">I", length_bytes)[0] + + # Read message data + data = self._subprocess.stdout.read(length) + if len(data) < length: + raise RuntimeError( + f"Incomplete message: expected {length} bytes, got {len(data)}" + ) + + return pickle.loads(data) + def execute_method(self, obj: Any, method_name: str) -> Any: 
""" Execute a method on a pickled object in the virtual environment. + Uses a persistent subprocess to avoid repeatedly importing Lance and + its dependencies. + Parameters ---------- obj : Any @@ -96,58 +150,56 @@ def execute_method(self, obj: Any, method_name: str) -> Any: if not self._created: raise RuntimeError("Virtual environment not created. Call create() first.") - # Get path to venv_runner.py - runner_script = Path(__file__).parent / "venv_runner.py" - - # Pickle the object - pickled_obj = pickle.dumps(obj) + # Ensure subprocess is running + self._ensure_subprocess() - # Set PYTHONPATH to include the tests directory so the venv can import - # test modules. This allows unpickling test classes (they're pickled as - # forward_compat.*) - import os + try: + # Send request: (obj, method_name) + self._send_message((obj, method_name)) - env = os.environ.copy() - tests_dir = Path(__file__).parent.parent - env["PYTHONPATH"] = str(tests_dir) + # Receive response + response = self._receive_message() - # Run the venv_runner.py script - result = subprocess.run( - [str(self.python_path), str(runner_script), method_name], - input=pickled_obj, - capture_output=True, - env=env, - ) - - # Parse the result - if result.returncode == 0: - response = pickle.loads(result.stdout) if response["success"]: return response["result"] else: - # This shouldn't happen if returncode is 0, but handle it - raise RuntimeError(f"Unexpected error: {response}") - else: - # Execution failed, unpickle error info - try: - error_info = pickle.loads(result.stdout) - # Re-create the exception with traceback info + # Error occurred in subprocess error_msg = ( f"Error in venv (Lance {self.version}) calling {method_name}:\n" - f"{error_info['exception_type']}: {error_info['exception_msg']}\n" - f"\nTraceback from venv:\n{error_info['traceback']}" + f"{response['exception_type']}: {response['exception_msg']}\n" + f"\nTraceback from venv:\n{response['traceback']}" ) raise RuntimeError(error_msg) - except 
(pickle.UnpicklingError, KeyError, EOFError): - # If we can't unpickle the error, show raw output - raise RuntimeError( - f"Failed to execute {method_name} in venv (Lance {self.version}):\n" - f"stdout: {result.stdout.decode('utf-8', errors='replace')}\n" - f"stderr: {result.stderr.decode('utf-8', errors='replace')}" + + except (BrokenPipeError, EOFError, struct.error) as e: + # Subprocess died or communication failed + stderr_output = "" + if self._subprocess and self._subprocess.stderr: + stderr_output = self._subprocess.stderr.read().decode( + "utf-8", errors="replace" ) + raise RuntimeError( + f"Communication with venv subprocess failed (Lance {self.version}):\n" + f"Error: {e}\n" + f"stderr: {stderr_output}" + ) + def cleanup(self): - """Remove the virtual environment directory.""" + """Remove the virtual environment directory and terminate subprocess.""" + # Terminate the persistent subprocess + if self._subprocess is not None: + try: + self._subprocess.stdin.close() + self._subprocess.terminate() + self._subprocess.wait(timeout=5) + except Exception: + # Force kill if graceful termination fails + self._subprocess.kill() + finally: + self._subprocess = None + + # Remove venv directory if self.venv_path.exists(): import shutil diff --git a/python/python/tests/forward_compat/venv_runner.py b/python/python/tests/forward_compat/venv_runner.py index a84add5c7b9..860e6d75f94 100644 --- a/python/python/tests/forward_compat/venv_runner.py +++ b/python/python/tests/forward_compat/venv_runner.py @@ -1,45 +1,81 @@ """ Runner script executed inside virtual environments to run compatibility tests. -This script is executed in a subprocess with a specific Lance version installed. -It receives a pickled test object and method name, executes the method, and -returns the result. +This script runs as a persistent subprocess that accepts multiple method calls +without restarting. This avoids the overhead of repeatedly importing Lance and +its dependencies. 
+ +Protocol: +- Reads 4 bytes (message length as big-endian int) +- Reads that many bytes (pickled tuple of (obj, method_name)) +- Executes method on object +- Writes 4 bytes (response length) +- Writes pickled response dict """ import pickle +import struct import sys import traceback +def read_message(stream): + """Read a length-prefixed pickled message from stream.""" + # Read 4-byte length header + length_bytes = stream.buffer.read(4) + if len(length_bytes) < 4: + return None # EOF + + length = struct.unpack(">I", length_bytes)[0] + + # Read message data + data = stream.buffer.read(length) + if len(data) < length: + raise RuntimeError( + f"Incomplete message: expected {length} bytes, got {len(data)}" + ) + + return pickle.loads(data) + + +def write_message(stream, obj): + """Write a length-prefixed pickled message to stream.""" + data = pickle.dumps(obj) + length = struct.pack(">I", len(data)) + stream.buffer.write(length) + stream.buffer.write(data) + stream.buffer.flush() + + def main(): - if len(sys.argv) < 2: - print("Usage: venv_runner.py ", file=sys.stderr) - sys.exit(1) - - method_name = sys.argv[1] - - try: - # Read pickled object from stdin - obj = pickle.load(sys.stdin.buffer) - - # Call the specified method - method = getattr(obj, method_name) - result = method() - - # Write success indicator and optional result - pickle.dump({"success": True, "result": result}, sys.stdout.buffer) - sys.exit(0) - - except Exception as e: - # Capture exception details to send back - error_info = { - "success": False, - "exception_type": type(e).__name__, - "exception_msg": str(e), - "traceback": traceback.format_exc(), - } - pickle.dump(error_info, sys.stdout.buffer) - sys.exit(1) + """Main loop that processes method calls until EOF.""" + while True: + try: + # Read request (obj, method_name) + request = read_message(sys.stdin) + if request is None: + # EOF - parent closed connection + break + + obj, method_name = request + + # Execute method + method = getattr(obj, 
method_name) + result = method() + + # Send success response + response = {"success": True, "result": result} + write_message(sys.stdout, response) + + except Exception as e: + # Send error response + error_info = { + "success": False, + "exception_type": type(e).__name__, + "exception_msg": str(e), + "traceback": traceback.format_exc(), + } + write_message(sys.stdout, error_info) if __name__ == "__main__": From 821f9b4d666ece7eba669b3719f53f1e54325d4e Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 6 Nov 2025 14:57:57 -0800 Subject: [PATCH 04/19] handle more recent versions --- .../tests/forward_compat/index_tests.py | 143 +++++++++++++++++- .../tests/forward_compat/venv_manager.py | 3 + 2 files changed, 139 insertions(+), 7 deletions(-) diff --git a/python/python/tests/forward_compat/index_tests.py b/python/python/tests/forward_compat/index_tests.py index ffec149d44b..feba479e069 100644 --- a/python/python/tests/forward_compat/index_tests.py +++ b/python/python/tests/forward_compat/index_tests.py @@ -1,5 +1,8 @@ import inspect +import json +import subprocess import sys +from functools import lru_cache from pathlib import Path import lance @@ -24,6 +27,92 @@ # c. test_optimize +@lru_cache(maxsize=1) +def last_stable_release(): + """Returns the latest stable version available on PyPI. + + Queries the PyPI JSON API to get the latest stable release of pylance. + Results are cached to avoid repeated network calls. + """ + try: + import urllib.request + + with urllib.request.urlopen( + "https://pypi.org/pypi/pylance/json", timeout=5 + ) as response: + data = json.loads(response.read()) + version = data["info"]["version"] + return version + except Exception as e: + # If we can't fetch, return None which will be filtered out + print( + f"Warning: Could not fetch latest stable release from PyPI: {e}", + file=sys.stderr, + ) + return None + + +@lru_cache(maxsize=1) +def last_beta_release(): + """Returns the latest beta version available on fury.io. 
+ + Uses pip to query the fury.io index for pre-release versions of pylance. + Results are cached to avoid repeated network calls. + """ + try: + # Use pip index to get versions from fury.io + result = subprocess.run( + [ + sys.executable, + "-m", + "pip", + "index", + "versions", + "pylance", + "--pre", + "--extra-index-url", + "https://pypi.fury.io/lancedb/", + ], + capture_output=True, + text=True, + timeout=10, + ) + + if result.returncode == 0: + # Parse output to find available versions + # Output format: "pylance (x.y.z)" + # Available versions: x.y.z.betaN, x.y.z, ... + for line in result.stdout.splitlines(): + if "Available versions:" in line: + versions_str = line.split("Available versions:")[1].strip() + versions = [v.strip() for v in versions_str.split(",")] + # Return the first beta/pre-release version + for v in versions: + if "beta" in v or "rc" in v or "a" in v or "b" in v: + return v + # If no pre-release found, return the first version + if versions: + return versions[0] + + print( + "Warning: Could not fetch latest beta release from fury.io", + file=sys.stderr, + ) + return None + + except Exception as e: + print( + f"Warning: Could not fetch latest beta release from fury.io: {e}", + file=sys.stderr, + ) + return None + + +# Fetch versions (cached) +LAST_STABLE_RELEASE = last_stable_release() +LAST_BETA_RELEASE = last_beta_release() + + class UpgradeDowngradeTest: def create(self): pass @@ -35,7 +124,12 @@ def check_write(self): pass -VERSIONS = ["0.16.0", "0.30.0", "0.36.0"] +# Default versions to test, filtering out any that couldn't be fetched +VERSIONS = [ + v + for v in ["0.16.0", "0.30.0", "0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE] + if v is not None +] def compat_test(versions=None): @@ -84,6 +178,17 @@ def check_write(self): if versions is None: versions = VERSIONS + # Filter out None values (in case some versions couldn't be fetched) + versions = [v for v in versions if v is not None] + + # Skip if no valid versions + if not 
versions: + + def decorator(cls): + return cls + + return decorator + def decorator(cls): # Extract existing parametrize marks from the class existing_params = ( @@ -203,17 +308,40 @@ def test_func({sig_params}): return namespace["test_func"] +# We start testing against the first release where 2.1 was stable. Before that +# the format was unstable so the readers will panic. +@compat_test(versions=["0.38.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +class BasicTypes2_1(UpgradeDowngradeTest): + def __init__(self, path: Path): + self.path = path + + def create(self): + batch = build_basic_types() + with LanceFileWriter( + str(self.path), version="2.1", schema=batch.schema + ) as writer: + writer.write_batch(batch) + + def check_read(self): + reader = LanceFileReader(str(self.path)) + table = reader.read_all().to_table() + assert table == build_basic_types() + + def check_write(self): + # Test with overwrite + with LanceFileWriter(str(self.path), version="2.1") as writer: + writer.write_batch(build_basic_types()) + + @compat_test() -@pytest.mark.parametrize("file_version", ["2.0"]) # Only test stable file versions -class BasicTypes(UpgradeDowngradeTest): - def __init__(self, path: Path, file_version: str): +class BasicTypes2_0(UpgradeDowngradeTest): + def __init__(self, path: Path): self.path = path - self.file_version = file_version def create(self): batch = build_basic_types() with LanceFileWriter( - str(self.path), version=self.file_version, schema=batch.schema + str(self.path), version="2.0", schema=batch.schema ) as writer: writer.write_batch(batch) @@ -223,7 +351,8 @@ def check_read(self): assert table == build_basic_types() def check_write(self): - with LanceFileWriter(str(self.path), version=self.file_version) as writer: + # Test with overwrite + with LanceFileWriter(str(self.path), version="2.0") as writer: writer.write_batch(build_basic_types()) diff --git a/python/python/tests/forward_compat/venv_manager.py 
b/python/python/tests/forward_compat/venv_manager.py index 202deee0289..8d08014325e 100644 --- a/python/python/tests/forward_compat/venv_manager.py +++ b/python/python/tests/forward_compat/venv_manager.py @@ -66,6 +66,9 @@ def create(self): "-m", "pip", "install", + "--pre", + "--extra-index-url", + "https://pypi.fury.io/lancedb/", f"pylance=={self.version}", "pytest", ], From 171b992357694d299a28e9b382141b8076897b37 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 6 Nov 2025 16:10:45 -0800 Subject: [PATCH 05/19] add index tests --- .../tests/forward_compat/index_tests.py | 391 ++++++++++++++++-- 1 file changed, 365 insertions(+), 26 deletions(-) diff --git a/python/python/tests/forward_compat/index_tests.py b/python/python/tests/forward_compat/index_tests.py index feba479e069..81277a892fa 100644 --- a/python/python/tests/forward_compat/index_tests.py +++ b/python/python/tests/forward_compat/index_tests.py @@ -1,31 +1,19 @@ import inspect import json +import shutil import subprocess import sys from functools import lru_cache from pathlib import Path import lance +import pyarrow as pa +import pyarrow.compute as pc import pytest from lance.file import LanceFileReader, LanceFileWriter from .util import build_basic_types -# Flow: -# 1. Old -# a. gen_data -# b. create_index -# c. test_query -# d. test_stats -# 2. Current -# a. test_query -# b. test_stats -# c. test_optimize -# 3. Old -# a. test_query -# b. test_stats -# c. test_optimize - @lru_cache(maxsize=1) def last_stable_release(): @@ -376,18 +364,369 @@ def check_write(self): ds.insert(build_basic_types()) -class IndexTest: - def gen_data(self): - pass +# ============================================================================ +# Index Compatibility Tests +# ============================================================================ +# These tests verify that indices created with one version of Lance can be +# read and written by other versions. 
- def create_index(self): - pass - def test_query(self): - pass +@compat_test(versions=["0.30.0", "0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +class BTreeIndex(UpgradeDowngradeTest): + """Test BTREE scalar index compatibility""" - def test_stats(self): - pass + def __init__(self, path: Path): + self.path = path - def test_optimize(self): - pass + def create(self): + """Create dataset with BTREE index.""" + shutil.rmtree(self.path, ignore_errors=True) + data = pa.table( + { + "idx": pa.array(range(1000)), + "btree": pa.array(range(1000)), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index("btree", "BTREE") + + def check_read(self): + """Verify BTREE index can be queried.""" + ds = lance.dataset(self.path) + table = ds.to_table(filter="btree == 7") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + # Verify index is used + explain = ds.scanner(filter="btree == 7").explain_plan() + assert "ScalarIndexQuery" in explain or "MaterializeIndex" in explain + + def check_write(self): + """Verify can insert data and optimize BTREE index.""" + ds = lance.dataset(self.path) + data = pa.table( + { + "idx": pa.array([1000]), + "btree": pa.array([1000]), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() + + # Verify new data is queryable + table = ds.to_table(filter="btree == 1000") + assert table.num_rows == 1 + + +@compat_test( + versions=["0.20.0", "0.30.0", "0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE] +) +class BitmapLabelListIndex(UpgradeDowngradeTest): + """Test BITMAP and LABEL_LIST scalar index compatibility (introduced in 0.20.0).""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with BITMAP and LABEL_LIST indices.""" + shutil.rmtree(self.path, ignore_errors=True) + data = pa.table( + { + "idx": pa.array(range(1000)), + "bitmap": pa.array(range(1000)), + "label_list": 
pa.array([[f"label{i}"] for i in range(1000)]), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index("bitmap", "BITMAP") + dataset.create_scalar_index("label_list", "LABEL_LIST") + + def check_read(self): + """Verify BITMAP and LABEL_LIST indices can be queried.""" + ds = lance.dataset(self.path) + + # Test BITMAP index + table = ds.to_table(filter="bitmap == 7") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + # Test LABEL_LIST index + table = ds.to_table(filter="array_has_any(label_list, ['label7'])") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + def check_write(self): + """Verify can insert data and optimize indices.""" + ds = lance.dataset(self.path) + data = pa.table( + { + "idx": pa.array([1000]), + "bitmap": pa.array([1000]), + "label_list": pa.array([["label1000"]]), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() + + +@compat_test(versions=["0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +class NgramIndex(UpgradeDowngradeTest): + """Test NGRAM index compatibility (introduced in 0.36.0).""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with NGRAM index.""" + shutil.rmtree(self.path, ignore_errors=True) + data = pa.table( + { + "idx": pa.array(range(1000)), + "ngram": pa.array([f"word{i}" for i in range(1000)]), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index("ngram", "NGRAM") + + def check_read(self): + """Verify NGRAM index can be queried.""" + ds = lance.dataset(self.path) + table = ds.to_table(filter="contains(ngram, 'word7')") + # word7, word70-79, word700-799 = 111 results + assert table.num_rows == 111 + + # Verify index is used + explain = ds.scanner(filter="contains(ngram, 'word7')").explain_plan() + assert "ScalarIndexQuery" in explain + + def check_write(self): + 
"""Verify can insert data and optimize NGRAM index.""" + ds = lance.dataset(self.path) + data = pa.table( + { + "idx": pa.array([1000]), + "ngram": pa.array(["word1000"]), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() + + +@compat_test(versions=["0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +class ZonemapBloomfilterIndex(UpgradeDowngradeTest): + """Test ZONEMAP and BLOOMFILTER index compatibility (introduced in 0.36.0).""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with ZONEMAP and BLOOMFILTER indices.""" + shutil.rmtree(self.path, ignore_errors=True) + data = pa.table( + { + "idx": pa.array(range(1000)), + "zonemap": pa.array(range(1000)), + "bloomfilter": pa.array(range(1000)), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index("zonemap", "ZONEMAP") + dataset.create_scalar_index("bloomfilter", "BLOOMFILTER") + + def check_read(self): + """Verify ZONEMAP and BLOOMFILTER indices can be queried.""" + ds = lance.dataset(self.path) + + # Test ZONEMAP + table = ds.to_table(filter="zonemap == 7") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + # Test BLOOMFILTER + table = ds.to_table(filter="bloomfilter == 7") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + def check_write(self): + """Verify can insert data and optimize indices.""" + ds = lance.dataset(self.path) + data = pa.table( + { + "idx": pa.array([1000]), + "zonemap": pa.array([1000]), + "bloomfilter": pa.array([1000]), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() + + +@compat_test(versions=["0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +class JsonIndex(UpgradeDowngradeTest): + """Test JSON index compatibility (introduced in 0.36.0).""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with JSON 
index.""" + from lance.indices import IndexConfig + + shutil.rmtree(self.path, ignore_errors=True) + data = pa.table( + { + "idx": pa.array(range(1000)), + "json": pa.array([f'{{"val": {i}}}' for i in range(1000)], pa.json_()), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index( + "json", + IndexConfig( + index_type="json", + parameters={"target_index_type": "btree", "path": "val"}, + ), + ) + + def check_read(self): + """Verify JSON index can be queried.""" + ds = lance.dataset(self.path) + table = ds.to_table(filter="json_get_int(json, 'val') == 7") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + # Verify index is used + explain = ds.scanner(filter="json_get_int(json, 'val') == 7").explain_plan() + assert "ScalarIndexQuery" in explain + + def check_write(self): + """Verify can insert data with JSON index.""" + ds = lance.dataset(self.path) + data = pa.table( + { + "idx": pa.array([1000]), + "json": pa.array(['{"val": 1000}'], pa.json_()), + } + ) + ds.insert(data) + # TODO: fix this https://github.com/lancedb/lance/issues/5177 + # ds.optimize.optimize_indices() + ds.optimize.compact_files() + + +@compat_test(versions=["0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +class FtsIndex(UpgradeDowngradeTest): + """Test FTS (full-text search) index compatibility (introduced in 0.36.0).""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with FTS index.""" + shutil.rmtree(self.path, ignore_errors=True) + data = pa.table( + { + "idx": pa.array(range(1000)), + "text": pa.array( + [f"document with words {i} and more text" for i in range(1000)] + ), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index("text", "INVERTED") + + def check_read(self): + """Verify FTS index can be queried.""" + ds = lance.dataset(self.path) + # Search for documents containing "words" and "7" + # Note: 
Actual FTS query syntax may vary + table = ds.to_table(filter="text LIKE '%words 7 %'") + assert table.num_rows > 0 + + def check_write(self): + """Verify can insert data with FTS index.""" + # Dataset::load_manifest does not do retain_supported_indices + # so this can only work with no cache + session = lance.Session(index_cache_size_bytes=0, metadata_cache_size_bytes=0) + ds = lance.dataset(self.path, session=session) + data = pa.table( + { + "idx": pa.array([1000]), + "text": pa.array(["new document to index"]), + } + ) + ds.insert(data) + ds.optimize.compact_files() + + +@compat_test( + versions=[ + "0.29.1.beta2", + "0.30.0", + "0.36.0", + LAST_STABLE_RELEASE, + LAST_BETA_RELEASE, + ] +) +class PqVectorIndex(UpgradeDowngradeTest): + """Test PQ (Product Quantization) vector index compatibility.""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with PQ vector index.""" + shutil.rmtree(self.path, ignore_errors=True) + ndims = 32 + nvecs = 512 + + data = pa.table( + { + "id": pa.array(range(nvecs)), + "vec": pa.FixedSizeListArray.from_arrays( + pc.random(ndims * nvecs).cast(pa.float32()), ndims + ), + } + ) + + dataset = lance.write_dataset(data, self.path) + dataset.create_index( + "vec", + "IVF_PQ", + num_partitions=1, + num_sub_vectors=4, + ) + + def check_read(self): + """Verify PQ index can be queried.""" + ds = lance.dataset(self.path) + # Query with random vector + q = pc.random(32).cast(pa.float32()) + result = ds.to_table( + nearest={ + "q": q, + "k": 4, + "column": "vec", + } + ) + assert result.num_rows == 4 + + def check_write(self): + """Verify can insert vectors and rebuild index.""" + ds = lance.dataset(self.path) + # Add new vectors + data = pa.table( + { + "id": pa.array([1000]), + "vec": pa.FixedSizeListArray.from_arrays( + pc.random(32).cast(pa.float32()), 32 + ), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() From ad8413eaf145968a8bfdb491f63813e314aa0d0b 
Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 6 Nov 2025 16:20:21 -0800 Subject: [PATCH 06/19] build large test --- .../tests/forward_compat/index_tests.py | 33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/python/python/tests/forward_compat/index_tests.py b/python/python/tests/forward_compat/index_tests.py index 81277a892fa..6edbfdf8d9b 100644 --- a/python/python/tests/forward_compat/index_tests.py +++ b/python/python/tests/forward_compat/index_tests.py @@ -12,7 +12,7 @@ import pytest from lance.file import LanceFileReader, LanceFileWriter -from .util import build_basic_types +from .util import build_basic_types, build_large @lru_cache(maxsize=1) @@ -322,26 +322,45 @@ def check_write(self): @compat_test() -class BasicTypes2_0(UpgradeDowngradeTest): - def __init__(self, path: Path): +@pytest.mark.parametrize( + "data_factory,name", + [ + (build_basic_types, "basic_types"), + (build_large, "large"), + ], + ids=["basic_types", "large"], +) +class FileCompat(UpgradeDowngradeTest): + """Test file format compatibility with different data types. + + Tests both basic types (scalars, strings, etc.) and large data (vectors, binary). 
+ """ + + def __init__(self, path: Path, data_factory, name: str): self.path = path + self.data_factory = data_factory + self.name = name def create(self): - batch = build_basic_types() + """Create Lance file with test data.""" + batch = self.data_factory() with LanceFileWriter( str(self.path), version="2.0", schema=batch.schema ) as writer: writer.write_batch(batch) def check_read(self): + """Verify file can be read and data matches.""" reader = LanceFileReader(str(self.path)) table = reader.read_all().to_table() - assert table == build_basic_types() + expected = self.data_factory() + assert table.equals(expected), f"Data mismatch for {self.name}" def check_write(self): - # Test with overwrite + """Verify can overwrite the file.""" + batch = self.data_factory() with LanceFileWriter(str(self.path), version="2.0") as writer: - writer.write_batch(build_basic_types()) + writer.write_batch(batch) @compat_test() From 28ad7e95c3abb0c4dfb975d029b934e858ac7883 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 6 Nov 2025 16:37:41 -0800 Subject: [PATCH 07/19] refactor compat tests into focused modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously all compatibility tests lived in a monolithic 751-line index_tests.py file mixed with infrastructure code. This made tests hard to find and maintain. Split into focused modules: - compat_decorator.py: infrastructure and @compat_test decorator - test_file_formats.py: file format compatibility tests - test_scalar_indices.py: scalar index compatibility tests - test_vector_indices.py: vector index compatibility tests Removed deprecated datagen.py and test_compat.py. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../tests/forward_compat/compat_decorator.py | 306 +++++++ python/python/tests/forward_compat/datagen.py | 154 ---- .../tests/forward_compat/index_tests.py | 751 ------------------ .../tests/forward_compat/test_compat.py | 243 ------ .../tests/forward_compat/test_file_formats.py | 114 +++ .../forward_compat/test_scalar_indices.py | 313 ++++++++ .../forward_compat/test_vector_indices.py | 92 +++ 7 files changed, 825 insertions(+), 1148 deletions(-) create mode 100644 python/python/tests/forward_compat/compat_decorator.py delete mode 100644 python/python/tests/forward_compat/datagen.py delete mode 100644 python/python/tests/forward_compat/index_tests.py delete mode 100644 python/python/tests/forward_compat/test_compat.py create mode 100644 python/python/tests/forward_compat/test_file_formats.py create mode 100644 python/python/tests/forward_compat/test_scalar_indices.py create mode 100644 python/python/tests/forward_compat/test_vector_indices.py diff --git a/python/python/tests/forward_compat/compat_decorator.py b/python/python/tests/forward_compat/compat_decorator.py new file mode 100644 index 00000000000..f3f56cff97d --- /dev/null +++ b/python/python/tests/forward_compat/compat_decorator.py @@ -0,0 +1,306 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +""" +Compatibility test infrastructure for Lance. + +This module provides the @compat_test() decorator and supporting infrastructure +for testing forward and backward compatibility across Lance versions. +""" + +import inspect +import json +import subprocess +import sys +from functools import lru_cache + +import pytest + + +@lru_cache(maxsize=1) +def last_stable_release(): + """Returns the latest stable version available on PyPI. + + Queries the PyPI JSON API to get the latest stable release of pylance. + Results are cached to avoid repeated network calls. 
+ """ + try: + import urllib.request + + with urllib.request.urlopen( + "https://pypi.org/pypi/pylance/json", timeout=5 + ) as response: + data = json.loads(response.read()) + version = data["info"]["version"] + return version + except Exception as e: + # If we can't fetch, return None which will be filtered out + print( + f"Warning: Could not fetch latest stable release from PyPI: {e}", + file=sys.stderr, + ) + return None + + +@lru_cache(maxsize=1) +def last_beta_release(): + """Returns the latest beta version available on fury.io. + + Uses pip to query the fury.io index for pre-release versions of pylance. + Results are cached to avoid repeated network calls. + """ + try: + # Use pip index to get versions from fury.io + result = subprocess.run( + [ + sys.executable, + "-m", + "pip", + "index", + "versions", + "pylance", + "--pre", + "--extra-index-url", + "https://pypi.fury.io/lancedb/", + ], + capture_output=True, + text=True, + timeout=10, + ) + + if result.returncode == 0: + # Parse output to find available versions + # Output format: "pylance (x.y.z)" + # Available versions: x.y.z.betaN, x.y.z, ... + for line in result.stdout.splitlines(): + if "Available versions:" in line: + versions_str = line.split("Available versions:")[1].strip() + versions = [v.strip() for v in versions_str.split(",")] + # Return the first beta/pre-release version + for v in versions: + if "beta" in v or "rc" in v or "a" in v or "b" in v: + return v + # If no pre-release found, return the first version + if versions: + return versions[0] + + print( + "Warning: Could not fetch latest beta release from fury.io", + file=sys.stderr, + ) + return None + + except Exception as e: + print( + f"Warning: Could not fetch latest beta release from fury.io: {e}", + file=sys.stderr, + ) + return None + + +# Fetch versions (cached) +LAST_STABLE_RELEASE = last_stable_release() +LAST_BETA_RELEASE = last_beta_release() + + +class UpgradeDowngradeTest: + """Base class for compatibility tests. 
+ + Subclasses should implement: + - create(): Create test data/indices with current Lance version + - check_read(): Verify data can be read correctly + - check_write(): Verify data can be written/modified + """ + + def create(self): + pass + + def check_read(self): + pass + + def check_write(self): + pass + + +# Default versions to test, filtering out any that couldn't be fetched +VERSIONS = [ + v + for v in ["0.16.0", "0.30.0", "0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE] + if v is not None +] + + +def compat_test(versions=None): + """Decorator to generate upgrade/downgrade compatibility tests. + + This decorator transforms a test class into two parameterized pytest test functions: + + 1. Downgrade test: Writes with current version, then reads with old version. + 2. Upgrade-Downgrade test: Writes with old version, reads with current version, + writes with current version, reads with old version. + + The test class should inherit from UpgradeDowngradeTest and implement: + - create(): Write data with the current Lance version + - check_read(): Verify data can be read + - check_write(): Verify data can be written + + The class can be parametrized with @pytest.mark.parametrize, and those + parameters will be applied to the generated test functions. + + Parameters + ---------- + versions : list of str, optional + List of Lance versions to test against. Defaults to VERSIONS. 
+ + Example + ------- + @compat_test() + @pytest.mark.parametrize("file_version", ["1.0", "2.0"]) + class BasicTypes(UpgradeDowngradeTest): + def __init__(self, path: Path, file_version: str): + self.path = path + self.file_version = file_version + + def create(self): + # Write data + pass + + def check_read(self): + # Read and verify data + pass + + def check_write(self): + # Write data + pass + """ + if versions is None: + versions = VERSIONS + + # Filter out None values (in case some versions couldn't be fetched) + versions = [v for v in versions if v is not None] + + # Skip if no valid versions + if not versions: + + def decorator(cls): + return cls + + return decorator + + def decorator(cls): + # Extract existing parametrize marks from the class + existing_params = ( + [ + m + for m in ( + cls.pytestmark + if isinstance(cls.pytestmark, list) + else [cls.pytestmark] + ) + if getattr(m, "name", None) == "parametrize" + ] + if hasattr(cls, "pytestmark") + else [] + ) + + # Get parameter names from __init__ (excluding 'self' and 'path') + sig = inspect.signature(cls.__init__) + param_names = [p for p in sig.parameters.keys() if p not in ("self", "path")] + + # Create test functions dynamically with proper signatures + downgrade_func = _make_test_function(cls, param_names, "downgrade") + upgrade_downgrade_func = _make_test_function( + cls, param_names, "upgrade_downgrade" + ) + + # Apply version parametrization + downgrade_func = pytest.mark.parametrize("version", versions)(downgrade_func) + upgrade_downgrade_func = pytest.mark.parametrize("version", versions)( + upgrade_downgrade_func + ) + + # Apply existing parametrize marks + for mark in existing_params: + downgrade_func = pytest.mark.parametrize(*mark.args, **mark.kwargs)( + downgrade_func + ) + upgrade_downgrade_func = pytest.mark.parametrize(*mark.args, **mark.kwargs)( + upgrade_downgrade_func + ) + + # Apply compat marker + downgrade_func = pytest.mark.compat(downgrade_func) + upgrade_downgrade_func = 
pytest.mark.compat(upgrade_downgrade_func) + + # Set function names + downgrade_func.__name__ = f"test_{cls.__name__}_downgrade" + upgrade_downgrade_func.__name__ = f"test_{cls.__name__}_upgrade_downgrade" + + # Register test functions in the module where the class is defined + module = sys.modules[cls.__module__] + setattr(module, downgrade_func.__name__, downgrade_func) + setattr(module, upgrade_downgrade_func.__name__, upgrade_downgrade_func) + + return cls + + return decorator + + +def _make_test_function(cls, param_names, test_type): + """Create a test function with the correct signature for pytest. + + Parameters + ---------- + cls : class + The test class to create a function for + param_names : list of str + Names of parameters from the class __init__ (excluding self and path) + test_type : str + Either "downgrade" or "upgrade_downgrade" + + Returns + ------- + function + Test function with correct signature for pytest + """ + # Build function signature + sig_params = "venv_factory, tmp_path, version" + for param in param_names: + sig_params += f", {param}" + + # Build parameter passing to __init__ + init_params = ", ".join(param_names) if param_names else "" + + # Build function body based on test type + if test_type == "downgrade": + func_body = f''' +def test_func({sig_params}): + """Test that old Lance version can read data written by current version.""" + from pathlib import Path + obj = cls(tmp_path / "data.lance", {init_params}) + # Current version: create data + obj.create() + # Old version: verify can read + venv = venv_factory.get_venv(version) + venv.execute_method(obj, "check_read") +''' + else: # upgrade_downgrade + func_body = f''' +def test_func({sig_params}): + """Test round-trip compatibility: old -> current -> old.""" + from pathlib import Path + obj = cls(tmp_path / "data.lance", {init_params}) + venv = venv_factory.get_venv(version) + # Old version: create data + venv.execute_method(obj, "create") + # Current version: read and write + 
obj.check_read() + obj.check_write() + # Old version: verify can still read + venv.execute_method(obj, "check_read") +''' + + # Execute to create the function + namespace = {"cls": cls} + exec(func_body, namespace) + return namespace["test_func"] diff --git a/python/python/tests/forward_compat/datagen.py b/python/python/tests/forward_compat/datagen.py deleted file mode 100644 index c5ef40609bd..00000000000 --- a/python/python/tests/forward_compat/datagen.py +++ /dev/null @@ -1,154 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright The Lance Authors - -# Data generation for forward compatibility tests -# -# This file will be run on the up-to-date version of Lance to generate -# test data that will be read by older versions of Lance in test_compat.py - -import shutil - -import lance -import pyarrow as pa -import pyarrow.compute as pc -from lance.file import LanceFileWriter -from lance.indices.builder import IndexConfig - -from forward_compat.util import build_basic_types, build_large, get_path - - -def write_basic_types(): - path = get_path("basic_types.lance") - with LanceFileWriter(str(path)) as writer: - writer.write_batch(build_basic_types()) - - -def write_large(): - path = get_path("large.lance") - with LanceFileWriter(str(path)) as writer: - writer.write_batch(build_large()) - - -def write_dataset_pq_buffer(): - # In https://github.com/lancedb/lance/pull/3829, we started storing the PQ - # codebook in a global buffer instead of the schema metadata as JSON. 
- - shutil.rmtree(get_path("pq_in_schema"), ignore_errors=True) - - ndims = 32 - nvecs = 512 - - data = pa.table( - { - "id": pa.array(range(nvecs)), - "vec": pa.FixedSizeListArray.from_arrays( - pc.random(ndims * nvecs).cast(pa.float32()), ndims - ), - } - ) - - dataset = lance.write_dataset(data, get_path("pq_in_schema")) - dataset.create_index( - "vec", - "IVF_PQ", - num_partitions=1, - num_sub_vectors=4, - ) - - -def write_dataset_json(): - shutil.rmtree(get_path("json"), ignore_errors=True) - - data = pa.table( - { - "idx": pa.array(range(1000)), - "json": pa.array([f'{{"val": {i}}}' for i in range(1000)], pa.json_()), - } - ) - - dataset = lance.write_dataset(data, get_path("json"), max_rows_per_file=100) - dataset.create_scalar_index( - "json", - IndexConfig( - index_type="json", parameters={"target_index_type": "btree", "path": "val"} - ), - ) - - -def write_dataset_btree_index(): - shutil.rmtree(get_path("btree_index"), ignore_errors=True) - - data = pa.table( - { - "idx": pa.array(range(1000)), - "btree": pa.array(range(1000)), - } - ) - - dataset = lance.write_dataset(data, get_path("btree_index"), max_rows_per_file=100) - dataset.create_scalar_index("btree", "BTREE") - - -def write_dataset_bitmap_labellist_index(): - shutil.rmtree(get_path("bitmap_labellist_index"), ignore_errors=True) - - data = pa.table( - { - "idx": pa.array(range(1000)), - "bitmap": pa.array(range(1000)), - "label_list": pa.array([[f"label{i}"] for i in range(1000)]), - } - ) - - dataset = lance.write_dataset( - data, get_path("bitmap_labellist_index"), max_rows_per_file=100 - ) - dataset.create_scalar_index("bitmap", "BITMAP") - dataset.create_scalar_index("label_list", "LABEL_LIST") - - -def write_dataset_ngram_zonemap_bloomfilter_index(): - shutil.rmtree(get_path("ngram_zonemap_bloomfilter_index"), ignore_errors=True) - - data = pa.table( - { - "idx": pa.array(range(1000)), - "ngram": pa.array([f"word{i}" for i in range(1000)]), - "zonemap": pa.array(range(1000)), - 
"bloomfilter": pa.array(range(1000)), - } - ) - - dataset = lance.write_dataset( - data, get_path("ngram_zonemap_bloomfilter_index"), max_rows_per_file=100 - ) - dataset.create_scalar_index("ngram", "NGRAM") - dataset.create_scalar_index("zonemap", "ZONEMAP") - dataset.create_scalar_index("bloomfilter", "BLOOMFILTER") - - -def write_dataset_fts_index(): - shutil.rmtree(get_path("fts_index"), ignore_errors=True) - - data = pa.table( - { - "idx": pa.array(range(1000)), - "text": pa.array( - [f"document with words {i} and more text" for i in range(1000)] - ), - } - ) - - dataset = lance.write_dataset(data, get_path("fts_index"), max_rows_per_file=100) - dataset.create_scalar_index("text", "INVERTED") - - -if __name__ == "__main__": - write_basic_types() - write_large() - write_dataset_pq_buffer() - write_dataset_btree_index() - write_dataset_bitmap_labellist_index() - write_dataset_ngram_zonemap_bloomfilter_index() - write_dataset_json() - write_dataset_fts_index() diff --git a/python/python/tests/forward_compat/index_tests.py b/python/python/tests/forward_compat/index_tests.py deleted file mode 100644 index 6edbfdf8d9b..00000000000 --- a/python/python/tests/forward_compat/index_tests.py +++ /dev/null @@ -1,751 +0,0 @@ -import inspect -import json -import shutil -import subprocess -import sys -from functools import lru_cache -from pathlib import Path - -import lance -import pyarrow as pa -import pyarrow.compute as pc -import pytest -from lance.file import LanceFileReader, LanceFileWriter - -from .util import build_basic_types, build_large - - -@lru_cache(maxsize=1) -def last_stable_release(): - """Returns the latest stable version available on PyPI. - - Queries the PyPI JSON API to get the latest stable release of pylance. - Results are cached to avoid repeated network calls. 
- """ - try: - import urllib.request - - with urllib.request.urlopen( - "https://pypi.org/pypi/pylance/json", timeout=5 - ) as response: - data = json.loads(response.read()) - version = data["info"]["version"] - return version - except Exception as e: - # If we can't fetch, return None which will be filtered out - print( - f"Warning: Could not fetch latest stable release from PyPI: {e}", - file=sys.stderr, - ) - return None - - -@lru_cache(maxsize=1) -def last_beta_release(): - """Returns the latest beta version available on fury.io. - - Uses pip to query the fury.io index for pre-release versions of pylance. - Results are cached to avoid repeated network calls. - """ - try: - # Use pip index to get versions from fury.io - result = subprocess.run( - [ - sys.executable, - "-m", - "pip", - "index", - "versions", - "pylance", - "--pre", - "--extra-index-url", - "https://pypi.fury.io/lancedb/", - ], - capture_output=True, - text=True, - timeout=10, - ) - - if result.returncode == 0: - # Parse output to find available versions - # Output format: "pylance (x.y.z)" - # Available versions: x.y.z.betaN, x.y.z, ... 
- for line in result.stdout.splitlines(): - if "Available versions:" in line: - versions_str = line.split("Available versions:")[1].strip() - versions = [v.strip() for v in versions_str.split(",")] - # Return the first beta/pre-release version - for v in versions: - if "beta" in v or "rc" in v or "a" in v or "b" in v: - return v - # If no pre-release found, return the first version - if versions: - return versions[0] - - print( - "Warning: Could not fetch latest beta release from fury.io", - file=sys.stderr, - ) - return None - - except Exception as e: - print( - f"Warning: Could not fetch latest beta release from fury.io: {e}", - file=sys.stderr, - ) - return None - - -# Fetch versions (cached) -LAST_STABLE_RELEASE = last_stable_release() -LAST_BETA_RELEASE = last_beta_release() - - -class UpgradeDowngradeTest: - def create(self): - pass - - def check_read(self): - pass - - def check_write(self): - pass - - -# Default versions to test, filtering out any that couldn't be fetched -VERSIONS = [ - v - for v in ["0.16.0", "0.30.0", "0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE] - if v is not None -] - - -def compat_test(versions=None): - """Decorator to generate upgrade/downgrade compatibility tests. - - This decorator transforms a test class into two parameterized pytest test functions: - - 1. Downgrade test: Writes with current version, then reads with old version. - 2. Upgrade-Downgrade test: Writes with old version, reads with current version, - writes with current version, reads with old version. - - The test class should inherit from UpgradeDowngradeTest and implement: - - create(): Write data with the current Lance version - - check_read(): Verify data can be read - - check_write(): Verify data can be written - - The class can be parametrized with @pytest.mark.parametrize, and those - parameters will be applied to the generated test functions. - - Parameters - ---------- - versions : list of str, optional - List of Lance versions to test against. 
Defaults to VERSIONS. - - Example - ------- - @compat_test() - @pytest.mark.parametrize("file_version", ["1.0", "2.0"]) - class BasicTypes(UpgradeDowngradeTest): - def __init__(self, path: Path, file_version: str): - self.path = path - self.file_version = file_version - - def create(self): - # Write data - pass - - def check_read(self): - # Read and verify data - pass - - def check_write(self): - # Write data - pass - """ - if versions is None: - versions = VERSIONS - - # Filter out None values (in case some versions couldn't be fetched) - versions = [v for v in versions if v is not None] - - # Skip if no valid versions - if not versions: - - def decorator(cls): - return cls - - return decorator - - def decorator(cls): - # Extract existing parametrize marks from the class - existing_params = ( - [ - m - for m in ( - cls.pytestmark - if isinstance(cls.pytestmark, list) - else [cls.pytestmark] - ) - if getattr(m, "name", None) == "parametrize" - ] - if hasattr(cls, "pytestmark") - else [] - ) - - # Get parameter names from __init__ (excluding 'self' and 'path') - sig = inspect.signature(cls.__init__) - param_names = [p for p in sig.parameters.keys() if p not in ("self", "path")] - - # Create test functions dynamically with proper signatures - downgrade_func = _make_test_function(cls, param_names, "downgrade") - upgrade_downgrade_func = _make_test_function( - cls, param_names, "upgrade_downgrade" - ) - - # Apply version parametrization - downgrade_func = pytest.mark.parametrize("version", versions)(downgrade_func) - upgrade_downgrade_func = pytest.mark.parametrize("version", versions)( - upgrade_downgrade_func - ) - - # Apply existing parametrize marks - for mark in existing_params: - downgrade_func = pytest.mark.parametrize(*mark.args, **mark.kwargs)( - downgrade_func - ) - upgrade_downgrade_func = pytest.mark.parametrize(*mark.args, **mark.kwargs)( - upgrade_downgrade_func - ) - - # Apply compat marker - downgrade_func = pytest.mark.compat(downgrade_func) - 
upgrade_downgrade_func = pytest.mark.compat(upgrade_downgrade_func) - - # Set function names - downgrade_func.__name__ = f"test_{cls.__name__}_downgrade" - upgrade_downgrade_func.__name__ = f"test_{cls.__name__}_upgrade_downgrade" - - # Register test functions in the module where the class is defined - module = sys.modules[cls.__module__] - setattr(module, downgrade_func.__name__, downgrade_func) - setattr(module, upgrade_downgrade_func.__name__, upgrade_downgrade_func) - - return cls - - return decorator - - -def _make_test_function(cls, param_names, test_type): - """Create a test function with the correct signature for pytest. - - Parameters - ---------- - cls : class - The test class to create a function for - param_names : list of str - Names of parameters from the class __init__ (excluding self and path) - test_type : str - Either "downgrade" or "upgrade_downgrade" - - Returns - ------- - function - Test function with correct signature for pytest - """ - # Build function signature - sig_params = "venv_factory, tmp_path, version" - for param in param_names: - sig_params += f", {param}" - - # Build parameter passing to __init__ - init_params = ", ".join(param_names) if param_names else "" - - # Build function body based on test type - if test_type == "downgrade": - func_body = f''' -def test_func({sig_params}): - """Test that old Lance version can read data written by current version.""" - from pathlib import Path - obj = cls(tmp_path / "data.lance", {init_params}) - # Current version: create data - obj.create() - # Old version: verify can read - venv = venv_factory.get_venv(version) - venv.execute_method(obj, "check_read") -''' - else: # upgrade_downgrade - func_body = f''' -def test_func({sig_params}): - """Test round-trip compatibility: old -> current -> old.""" - from pathlib import Path - obj = cls(tmp_path / "data.lance", {init_params}) - venv = venv_factory.get_venv(version) - # Old version: create data - venv.execute_method(obj, "create") - # Current 
version: read and write - obj.check_read() - obj.check_write() - # Old version: verify can still read - venv.execute_method(obj, "check_read") -''' - - # Execute to create the function - namespace = {"cls": cls} - exec(func_body, namespace) - return namespace["test_func"] - - -# We start testing against the first release where 2.1 was stable. Before that -# the format was unstable to the readers will panic. -@compat_test(versions=["0.38.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) -class BasicTypes2_1(UpgradeDowngradeTest): - def __init__(self, path: Path): - self.path = path - - def create(self): - batch = build_basic_types() - with LanceFileWriter( - str(self.path), version="2.1", schema=batch.schema - ) as writer: - writer.write_batch(batch) - - def check_read(self): - reader = LanceFileReader(str(self.path)) - table = reader.read_all().to_table() - assert table == build_basic_types() - - def check_write(self): - # Test with overwrite - with LanceFileWriter(str(self.path), version="2.1") as writer: - writer.write_batch(build_basic_types()) - - -@compat_test() -@pytest.mark.parametrize( - "data_factory,name", - [ - (build_basic_types, "basic_types"), - (build_large, "large"), - ], - ids=["basic_types", "large"], -) -class FileCompat(UpgradeDowngradeTest): - """Test file format compatibility with different data types. - - Tests both basic types (scalars, strings, etc.) and large data (vectors, binary). 
- """ - - def __init__(self, path: Path, data_factory, name: str): - self.path = path - self.data_factory = data_factory - self.name = name - - def create(self): - """Create Lance file with test data.""" - batch = self.data_factory() - with LanceFileWriter( - str(self.path), version="2.0", schema=batch.schema - ) as writer: - writer.write_batch(batch) - - def check_read(self): - """Verify file can be read and data matches.""" - reader = LanceFileReader(str(self.path)) - table = reader.read_all().to_table() - expected = self.data_factory() - assert table.equals(expected), f"Data mismatch for {self.name}" - - def check_write(self): - """Verify can overwrite the file.""" - batch = self.data_factory() - with LanceFileWriter(str(self.path), version="2.0") as writer: - writer.write_batch(batch) - - -@compat_test() -class BasicTypesLegacy(UpgradeDowngradeTest): - def __init__(self, path: Path): - self.path = path - - def create(self): - batch = build_basic_types() - lance.write_dataset(batch, self.path, data_storage_version="0.1") - - def check_read(self): - ds = lance.dataset(self.path) - table = ds.to_table() - assert table == build_basic_types() - - def check_write(self): - ds = lance.dataset(self.path) - ds.delete("true") - ds.insert(build_basic_types()) - - -# ============================================================================ -# Index Compatibility Tests -# ============================================================================ -# These tests verify that indices created with one version of Lance can be -# read and written by other versions. 
- - -@compat_test(versions=["0.30.0", "0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) -class BTreeIndex(UpgradeDowngradeTest): - """Test BTREE scalar index compatibility""" - - def __init__(self, path: Path): - self.path = path - - def create(self): - """Create dataset with BTREE index.""" - shutil.rmtree(self.path, ignore_errors=True) - data = pa.table( - { - "idx": pa.array(range(1000)), - "btree": pa.array(range(1000)), - } - ) - dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) - dataset.create_scalar_index("btree", "BTREE") - - def check_read(self): - """Verify BTREE index can be queried.""" - ds = lance.dataset(self.path) - table = ds.to_table(filter="btree == 7") - assert table.num_rows == 1 - assert table.column("idx").to_pylist() == [7] - - # Verify index is used - explain = ds.scanner(filter="btree == 7").explain_plan() - assert "ScalarIndexQuery" in explain or "MaterializeIndex" in explain - - def check_write(self): - """Verify can insert data and optimize BTREE index.""" - ds = lance.dataset(self.path) - data = pa.table( - { - "idx": pa.array([1000]), - "btree": pa.array([1000]), - } - ) - ds.insert(data) - ds.optimize.optimize_indices() - ds.optimize.compact_files() - - # Verify new data is queryable - table = ds.to_table(filter="btree == 1000") - assert table.num_rows == 1 - - -@compat_test( - versions=["0.20.0", "0.30.0", "0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE] -) -class BitmapLabelListIndex(UpgradeDowngradeTest): - """Test BITMAP and LABEL_LIST scalar index compatibility (introduced in 0.20.0).""" - - def __init__(self, path: Path): - self.path = path - - def create(self): - """Create dataset with BITMAP and LABEL_LIST indices.""" - shutil.rmtree(self.path, ignore_errors=True) - data = pa.table( - { - "idx": pa.array(range(1000)), - "bitmap": pa.array(range(1000)), - "label_list": pa.array([[f"label{i}"] for i in range(1000)]), - } - ) - dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) - 
dataset.create_scalar_index("bitmap", "BITMAP") - dataset.create_scalar_index("label_list", "LABEL_LIST") - - def check_read(self): - """Verify BITMAP and LABEL_LIST indices can be queried.""" - ds = lance.dataset(self.path) - - # Test BITMAP index - table = ds.to_table(filter="bitmap == 7") - assert table.num_rows == 1 - assert table.column("idx").to_pylist() == [7] - - # Test LABEL_LIST index - table = ds.to_table(filter="array_has_any(label_list, ['label7'])") - assert table.num_rows == 1 - assert table.column("idx").to_pylist() == [7] - - def check_write(self): - """Verify can insert data and optimize indices.""" - ds = lance.dataset(self.path) - data = pa.table( - { - "idx": pa.array([1000]), - "bitmap": pa.array([1000]), - "label_list": pa.array([["label1000"]]), - } - ) - ds.insert(data) - ds.optimize.optimize_indices() - ds.optimize.compact_files() - - -@compat_test(versions=["0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) -class NgramIndex(UpgradeDowngradeTest): - """Test NGRAM index compatibility (introduced in 0.36.0).""" - - def __init__(self, path: Path): - self.path = path - - def create(self): - """Create dataset with NGRAM index.""" - shutil.rmtree(self.path, ignore_errors=True) - data = pa.table( - { - "idx": pa.array(range(1000)), - "ngram": pa.array([f"word{i}" for i in range(1000)]), - } - ) - dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) - dataset.create_scalar_index("ngram", "NGRAM") - - def check_read(self): - """Verify NGRAM index can be queried.""" - ds = lance.dataset(self.path) - table = ds.to_table(filter="contains(ngram, 'word7')") - # word7, word70-79, word700-799 = 111 results - assert table.num_rows == 111 - - # Verify index is used - explain = ds.scanner(filter="contains(ngram, 'word7')").explain_plan() - assert "ScalarIndexQuery" in explain - - def check_write(self): - """Verify can insert data and optimize NGRAM index.""" - ds = lance.dataset(self.path) - data = pa.table( - { - "idx": pa.array([1000]), - 
"ngram": pa.array(["word1000"]), - } - ) - ds.insert(data) - ds.optimize.optimize_indices() - ds.optimize.compact_files() - - -@compat_test(versions=["0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) -class ZonemapBloomfilterIndex(UpgradeDowngradeTest): - """Test ZONEMAP and BLOOMFILTER index compatibility (introduced in 0.36.0).""" - - def __init__(self, path: Path): - self.path = path - - def create(self): - """Create dataset with ZONEMAP and BLOOMFILTER indices.""" - shutil.rmtree(self.path, ignore_errors=True) - data = pa.table( - { - "idx": pa.array(range(1000)), - "zonemap": pa.array(range(1000)), - "bloomfilter": pa.array(range(1000)), - } - ) - dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) - dataset.create_scalar_index("zonemap", "ZONEMAP") - dataset.create_scalar_index("bloomfilter", "BLOOMFILTER") - - def check_read(self): - """Verify ZONEMAP and BLOOMFILTER indices can be queried.""" - ds = lance.dataset(self.path) - - # Test ZONEMAP - table = ds.to_table(filter="zonemap == 7") - assert table.num_rows == 1 - assert table.column("idx").to_pylist() == [7] - - # Test BLOOMFILTER - table = ds.to_table(filter="bloomfilter == 7") - assert table.num_rows == 1 - assert table.column("idx").to_pylist() == [7] - - def check_write(self): - """Verify can insert data and optimize indices.""" - ds = lance.dataset(self.path) - data = pa.table( - { - "idx": pa.array([1000]), - "zonemap": pa.array([1000]), - "bloomfilter": pa.array([1000]), - } - ) - ds.insert(data) - ds.optimize.optimize_indices() - ds.optimize.compact_files() - - -@compat_test(versions=["0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) -class JsonIndex(UpgradeDowngradeTest): - """Test JSON index compatibility (introduced in 0.36.0).""" - - def __init__(self, path: Path): - self.path = path - - def create(self): - """Create dataset with JSON index.""" - from lance.indices import IndexConfig - - shutil.rmtree(self.path, ignore_errors=True) - data = pa.table( - { - "idx": 
pa.array(range(1000)), - "json": pa.array([f'{{"val": {i}}}' for i in range(1000)], pa.json_()), - } - ) - dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) - dataset.create_scalar_index( - "json", - IndexConfig( - index_type="json", - parameters={"target_index_type": "btree", "path": "val"}, - ), - ) - - def check_read(self): - """Verify JSON index can be queried.""" - ds = lance.dataset(self.path) - table = ds.to_table(filter="json_get_int(json, 'val') == 7") - assert table.num_rows == 1 - assert table.column("idx").to_pylist() == [7] - - # Verify index is used - explain = ds.scanner(filter="json_get_int(json, 'val') == 7").explain_plan() - assert "ScalarIndexQuery" in explain - - def check_write(self): - """Verify can insert data with JSON index.""" - ds = lance.dataset(self.path) - data = pa.table( - { - "idx": pa.array([1000]), - "json": pa.array(['{"val": 1000}'], pa.json_()), - } - ) - ds.insert(data) - # TODO: fix this https://github.com/lancedb/lance/issues/5177 - # ds.optimize.optimize_indices() - ds.optimize.compact_files() - - -@compat_test(versions=["0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) -class FtsIndex(UpgradeDowngradeTest): - """Test FTS (full-text search) index compatibility (introduced in 0.36.0).""" - - def __init__(self, path: Path): - self.path = path - - def create(self): - """Create dataset with FTS index.""" - shutil.rmtree(self.path, ignore_errors=True) - data = pa.table( - { - "idx": pa.array(range(1000)), - "text": pa.array( - [f"document with words {i} and more text" for i in range(1000)] - ), - } - ) - dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) - dataset.create_scalar_index("text", "INVERTED") - - def check_read(self): - """Verify FTS index can be queried.""" - ds = lance.dataset(self.path) - # Search for documents containing "words" and "7" - # Note: Actual FTS query syntax may vary - table = ds.to_table(filter="text LIKE '%words 7 %'") - assert table.num_rows > 0 - - def 
check_write(self): - """Verify can insert data with FTS index.""" - # Dataset::load_manifest does not do retain_supported_indices - # so this can only work with no cache - session = lance.Session(index_cache_size_bytes=0, metadata_cache_size_bytes=0) - ds = lance.dataset(self.path, session=session) - data = pa.table( - { - "idx": pa.array([1000]), - "text": pa.array(["new document to index"]), - } - ) - ds.insert(data) - ds.optimize.compact_files() - - -@compat_test( - versions=[ - "0.29.1.beta2", - "0.30.0", - "0.36.0", - LAST_STABLE_RELEASE, - LAST_BETA_RELEASE, - ] -) -class PqVectorIndex(UpgradeDowngradeTest): - """Test PQ (Product Quantization) vector index compatibility.""" - - def __init__(self, path: Path): - self.path = path - - def create(self): - """Create dataset with PQ vector index.""" - shutil.rmtree(self.path, ignore_errors=True) - ndims = 32 - nvecs = 512 - - data = pa.table( - { - "id": pa.array(range(nvecs)), - "vec": pa.FixedSizeListArray.from_arrays( - pc.random(ndims * nvecs).cast(pa.float32()), ndims - ), - } - ) - - dataset = lance.write_dataset(data, self.path) - dataset.create_index( - "vec", - "IVF_PQ", - num_partitions=1, - num_sub_vectors=4, - ) - - def check_read(self): - """Verify PQ index can be queried.""" - ds = lance.dataset(self.path) - # Query with random vector - q = pc.random(32).cast(pa.float32()) - result = ds.to_table( - nearest={ - "q": q, - "k": 4, - "column": "vec", - } - ) - assert result.num_rows == 4 - - def check_write(self): - """Verify can insert vectors and rebuild index.""" - ds = lance.dataset(self.path) - # Add new vectors - data = pa.table( - { - "id": pa.array([1000]), - "vec": pa.FixedSizeListArray.from_arrays( - pc.random(32).cast(pa.float32()), 32 - ), - } - ) - ds.insert(data) - ds.optimize.optimize_indices() - ds.optimize.compact_files() diff --git a/python/python/tests/forward_compat/test_compat.py b/python/python/tests/forward_compat/test_compat.py deleted file mode 100644 index 
5a1e2a1adde..00000000000 --- a/python/python/tests/forward_compat/test_compat.py +++ /dev/null @@ -1,243 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright The Lance Authors - -# Forward compatibility tests for older versions of Lance -# -# This file will be run on older versions of Lance to test that the -# current version of Lance can read the test data generated by datagen.py. - -import shutil - -import lance -import pyarrow as pa -import pyarrow.compute as pc -import pytest -from lance.file import LanceFileReader -from packaging.version import Version - -from .util import build_basic_types, build_large, get_path - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.36.0"), # at least 0.36.0 - reason="version is too old to support JSON index", -) -def test_json_index(): - ds = lance.dataset(get_path("json")) - tbl = ds.to_table(filter="json_get_int(json, 'val') == 7") - assert tbl.num_rows == 1 - assert tbl.column("idx").to_pylist() == [7] - - explain = ds.scanner(filter="json_get_int(json, 'val') == 7").explain_plan() - assert "ScalarIndexQuery" in explain - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.36.0"), # at least 0.36.0 - reason="version is too old to support NGRAM index", -) -def test_ngram_index(): - ds = lance.dataset(get_path("ngram_zonemap_bloomfilter_index")) - tbl = ds.to_table(filter="contains(ngram, 'word7')") - assert tbl.num_rows == 111 - - explain = ds.scanner(filter="contains(ngram, 'word7')").explain_plan() - assert "ScalarIndexQuery" in explain - - -def query_seven(ds, filt: str): - table = ds.to_table(filter=filt) - assert table.num_rows == 1 - assert table.column("idx").to_pylist() == [7] - - explain = ds.scanner(filter=filt).explain_plan() - assert "ScalarIndexQuery" in explain or "MaterializeIndex" in explain - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.20.0"), - reason="Version 
is too old to read index files stored with Lance 2.0 file format", -) -def test_index_search(): - ds = lance.dataset(get_path("btree_index")) - query_seven(ds, "btree == 7") - - ds = lance.dataset(get_path("bitmap_labellist_index")) - - query_seven(ds, "bitmap == 7") - query_seven(ds, "array_has_any(label_list, ['label7'])") - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.36.0"), - reason="ZONEMAP and BLOOMFILTER indices were introduced in 0.36.0", -) -def test_zonemap_bloomfilter_index_search(): - ds = lance.dataset(get_path("ngram_zonemap_bloomfilter_index")) - query_seven(ds, "zonemap == 7") - query_seven(ds, "bloomfilter == 7") - - -@pytest.mark.forward -def test_scans(): - expected_basic_types = build_basic_types() - actual_basic_types = ( - LanceFileReader(str(get_path("basic_types.lance"))).read_all().to_table() - ) - assert actual_basic_types.equals(expected_basic_types) - - expected_large = build_large() - actual_large = LanceFileReader(str(get_path("large.lance"))).read_all().to_table() - assert actual_large.equals(expected_large) - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.29.1.beta2"), # at least 0.29.1-beta.2 - reason="Lance 0.29.1-beta.2 would ignore indices too new", -) -def test_pq_buffer(): - ds = lance.dataset(get_path("pq_in_schema")) - # the index should be ignored, still able to query (brute force) - q = pc.random(32).cast(pa.float32()) - ds.to_table( - nearest={ - "q": q, - "k": 4, - "column": "vec", - } - ) - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.36.0"), - reason="FTS token set format was introduced in 0.36.0", -) -def test_list_indices_ignores_new_fts_index_version(): - # Dataset::load_manifest does not do retain_supported_indices - # so this can only work with no cache - session = lance.Session(index_cache_size_bytes=0, metadata_cache_size_bytes=0) - ds = lance.dataset(get_path("fts_index"), 
session=session) - indices = ds.list_indices() - # the new index version should be ignored - assert len(indices) == 0 - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.35.0"), - reason="0.35.0 changes BTREE index schema", -) -def test_write_btree_index(tmp_path: str): - path = get_path("btree_index") - # copy to tmp path to avoid modifying original - shutil.copytree(path, tmp_path, dirs_exist_ok=True) - - ds = lance.dataset(tmp_path) - data = pa.table( - { - "idx": pa.array([1000]), - "btree": pa.array([1000]), - } - ) - ds.insert(data) - ds.optimize.optimize_indices() - ds.optimize.compact_files() - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.20.0"), - reason="Version is too old to read index files stored with Lance 2.0 file format", -) -def test_write_bitmap_labellist_index(tmp_path: str): - path = get_path("bitmap_labellist_index") - # copy to tmp path to avoid modifying original - shutil.copytree(path, tmp_path, dirs_exist_ok=True) - - ds = lance.dataset(tmp_path) - data = pa.table( - { - "idx": pa.array([1000]), - "bitmap": pa.array([1000]), - "label_list": pa.array([["label1000"]]), - } - ) - ds.insert(data) - ds.optimize.optimize_indices() - ds.optimize.compact_files() - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.36.0"), - reason="NGRAM index was introduced in 0.36.0", -) -def test_write_ngram_index(tmp_path: str): - path = get_path("ngram_zonemap_bloomfilter_index") - # copy to tmp path to avoid modifying original - shutil.copytree(path, tmp_path, dirs_exist_ok=True) - - ds = lance.dataset(tmp_path) - data = pa.table( - { - "idx": pa.array([1000]), - "ngram": pa.array(["word1000"]), - } - ) - ds.insert(data) - ds.optimize.optimize_indices() - ds.optimize.compact_files() - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.36.0"), - reason="ZONEMAP and BLOOMFILTER index was introduced in 
0.36.0", -) -def test_write_zonemap_bloomfilter_index(tmp_path: str): - path = get_path("ngram_zonemap_bloomfilter_index") - # copy to tmp path to avoid modifying original - shutil.copytree(path, tmp_path, dirs_exist_ok=True) - - ds = lance.dataset(tmp_path) - data = pa.table( - { - "idx": pa.array([1000]), - "zonemap": pa.array([1000]), - "bloomfilter": pa.array([1000]), - } - ) - ds.insert(data) - ds.optimize.optimize_indices() - ds.optimize.compact_files() - - -@pytest.mark.forward -@pytest.mark.skipif( - Version(lance.__version__) < Version("0.36.0"), - reason="FTS token set format was introduced in 0.36.0", -) -def test_write_fts(tmp_path: str): - path = get_path("fts_index") - # copy to tmp path to avoid modifying original - shutil.copytree(path, tmp_path, dirs_exist_ok=True) - - # Dataset::load_manifest does not do retain_supported_indices - # so this can only work with no cache - session = lance.Session(index_cache_size_bytes=0, metadata_cache_size_bytes=0) - ds = lance.dataset(tmp_path, session=session) - data = pa.table( - { - "idx": pa.array([1000]), - "text": pa.array(["new document to index"]), - } - ) - ds.insert(data) - # ds.optimize.optimize_indices() - ds.optimize.compact_files() diff --git a/python/python/tests/forward_compat/test_file_formats.py b/python/python/tests/forward_compat/test_file_formats.py new file mode 100644 index 00000000000..d4a0c318a58 --- /dev/null +++ b/python/python/tests/forward_compat/test_file_formats.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +""" +File format compatibility tests for Lance. + +Tests that Lance files can be read and written across different versions, +covering various data types and file format versions. 
+""" + +from pathlib import Path + +import lance +import pytest +from lance.file import LanceFileReader, LanceFileWriter + +from .compat_decorator import ( + LAST_BETA_RELEASE, + LAST_STABLE_RELEASE, + UpgradeDowngradeTest, + compat_test, +) +from .util import build_basic_types, build_large + + +# We start testing against the first release where 2.1 was stable. Before that +# the format was unstable so the readers will panic. +@compat_test(versions=["0.38.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +class BasicTypes2_1(UpgradeDowngradeTest): + """Test file format 2.1 compatibility with basic data types.""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + batch = build_basic_types() + with LanceFileWriter( + str(self.path), version="2.1", schema=batch.schema + ) as writer: + writer.write_batch(batch) + + def check_read(self): + reader = LanceFileReader(str(self.path)) + table = reader.read_all().to_table() + assert table == build_basic_types() + + def check_write(self): + # Test with overwrite + with LanceFileWriter(str(self.path), version="2.1") as writer: + writer.write_batch(build_basic_types()) + + +@compat_test() +@pytest.mark.parametrize( + "data_factory,name", + [ + (build_basic_types, "basic_types"), + (build_large, "large"), + ], + ids=["basic_types", "large"], +) +class FileCompat(UpgradeDowngradeTest): + """Test file format compatibility with different data types. + + Tests both basic types (scalars, strings, etc.) and large data (vectors, binary). 
+ """ + + def __init__(self, path: Path, data_factory, name: str): + self.path = path + self.data_factory = data_factory + self.name = name + + def create(self): + """Create Lance file with test data.""" + batch = self.data_factory() + with LanceFileWriter( + str(self.path), version="2.0", schema=batch.schema + ) as writer: + writer.write_batch(batch) + + def check_read(self): + """Verify file can be read and data matches.""" + reader = LanceFileReader(str(self.path)) + table = reader.read_all().to_table() + expected = self.data_factory() + assert table.equals(expected), f"Data mismatch for {self.name}" + + def check_write(self): + """Verify can overwrite the file.""" + batch = self.data_factory() + with LanceFileWriter(str(self.path), version="2.0") as writer: + writer.write_batch(batch) + + +@compat_test() +class BasicTypesLegacy(UpgradeDowngradeTest): + """Test legacy data storage version 0.1 compatibility.""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + batch = build_basic_types() + lance.write_dataset(batch, self.path, data_storage_version="0.1") + + def check_read(self): + ds = lance.dataset(self.path) + table = ds.to_table() + assert table == build_basic_types() + + def check_write(self): + ds = lance.dataset(self.path) + ds.delete("true") + ds.insert(build_basic_types()) diff --git a/python/python/tests/forward_compat/test_scalar_indices.py b/python/python/tests/forward_compat/test_scalar_indices.py new file mode 100644 index 00000000000..9393769c862 --- /dev/null +++ b/python/python/tests/forward_compat/test_scalar_indices.py @@ -0,0 +1,313 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +""" +Scalar index compatibility tests for Lance. + +Tests that scalar indices (BTREE, BITMAP, LABEL_LIST, NGRAM, ZONEMAP, +BLOOMFILTER, JSON, FTS) created with one version of Lance can be read +and written by other versions. 
+""" + +import shutil +from pathlib import Path + +import lance +import pyarrow as pa + +from .compat_decorator import ( + LAST_BETA_RELEASE, + LAST_STABLE_RELEASE, + UpgradeDowngradeTest, + compat_test, +) + + +@compat_test(versions=["0.30.0", "0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +class BTreeIndex(UpgradeDowngradeTest): + """Test BTREE scalar index compatibility (introduced in 0.20.0).""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with BTREE index.""" + shutil.rmtree(self.path, ignore_errors=True) + data = pa.table( + { + "idx": pa.array(range(1000)), + "btree": pa.array(range(1000)), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index("btree", "BTREE") + + def check_read(self): + """Verify BTREE index can be queried.""" + ds = lance.dataset(self.path) + table = ds.to_table(filter="btree == 7") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + # Verify index is used + explain = ds.scanner(filter="btree == 7").explain_plan() + assert "ScalarIndexQuery" in explain or "MaterializeIndex" in explain + + def check_write(self): + """Verify can insert data and optimize BTREE index.""" + ds = lance.dataset(self.path) + data = pa.table( + { + "idx": pa.array([1000]), + "btree": pa.array([1000]), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() + + # Verify new data is queryable + table = ds.to_table(filter="btree == 1000") + assert table.num_rows == 1 + + +@compat_test( + versions=["0.20.0", "0.30.0", "0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE] +) +class BitmapLabelListIndex(UpgradeDowngradeTest): + """Test BITMAP and LABEL_LIST scalar index compatibility (introduced in 0.20.0).""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with BITMAP and LABEL_LIST indices.""" + shutil.rmtree(self.path, ignore_errors=True) + data = 
pa.table( + { + "idx": pa.array(range(1000)), + "bitmap": pa.array(range(1000)), + "label_list": pa.array([[f"label{i}"] for i in range(1000)]), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index("bitmap", "BITMAP") + dataset.create_scalar_index("label_list", "LABEL_LIST") + + def check_read(self): + """Verify BITMAP and LABEL_LIST indices can be queried.""" + ds = lance.dataset(self.path) + + # Test BITMAP index + table = ds.to_table(filter="bitmap == 7") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + # Test LABEL_LIST index + table = ds.to_table(filter="array_has_any(label_list, ['label7'])") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + def check_write(self): + """Verify can insert data and optimize indices.""" + ds = lance.dataset(self.path) + data = pa.table( + { + "idx": pa.array([1000]), + "bitmap": pa.array([1000]), + "label_list": pa.array([["label1000"]]), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() + + +@compat_test(versions=["0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +class NgramIndex(UpgradeDowngradeTest): + """Test NGRAM index compatibility (introduced in 0.36.0).""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with NGRAM index.""" + shutil.rmtree(self.path, ignore_errors=True) + data = pa.table( + { + "idx": pa.array(range(1000)), + "ngram": pa.array([f"word{i}" for i in range(1000)]), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index("ngram", "NGRAM") + + def check_read(self): + """Verify NGRAM index can be queried.""" + ds = lance.dataset(self.path) + table = ds.to_table(filter="contains(ngram, 'word7')") + # word7, word70-79, word700-799 = 111 results + assert table.num_rows == 111 + + # Verify index is used + explain = ds.scanner(filter="contains(ngram, 
'word7')").explain_plan() + assert "ScalarIndexQuery" in explain + + def check_write(self): + """Verify can insert data and optimize NGRAM index.""" + ds = lance.dataset(self.path) + data = pa.table( + { + "idx": pa.array([1000]), + "ngram": pa.array(["word1000"]), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() + + +@compat_test(versions=["0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +class ZonemapBloomfilterIndex(UpgradeDowngradeTest): + """Test ZONEMAP and BLOOMFILTER index compatibility (introduced in 0.36.0).""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with ZONEMAP and BLOOMFILTER indices.""" + shutil.rmtree(self.path, ignore_errors=True) + data = pa.table( + { + "idx": pa.array(range(1000)), + "zonemap": pa.array(range(1000)), + "bloomfilter": pa.array(range(1000)), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index("zonemap", "ZONEMAP") + dataset.create_scalar_index("bloomfilter", "BLOOMFILTER") + + def check_read(self): + """Verify ZONEMAP and BLOOMFILTER indices can be queried.""" + ds = lance.dataset(self.path) + + # Test ZONEMAP + table = ds.to_table(filter="zonemap == 7") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + # Test BLOOMFILTER + table = ds.to_table(filter="bloomfilter == 7") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + def check_write(self): + """Verify can insert data and optimize indices.""" + ds = lance.dataset(self.path) + data = pa.table( + { + "idx": pa.array([1000]), + "zonemap": pa.array([1000]), + "bloomfilter": pa.array([1000]), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() + + +@compat_test(versions=["0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +class JsonIndex(UpgradeDowngradeTest): + """Test JSON index compatibility (introduced in 0.36.0).""" + + def 
__init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with JSON index.""" + from lance.indices import IndexConfig + + shutil.rmtree(self.path, ignore_errors=True) + data = pa.table( + { + "idx": pa.array(range(1000)), + "json": pa.array([f'{{"val": {i}}}' for i in range(1000)], pa.json_()), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index( + "json", + IndexConfig( + index_type="json", + parameters={"target_index_type": "btree", "path": "val"}, + ), + ) + + def check_read(self): + """Verify JSON index can be queried.""" + ds = lance.dataset(self.path) + table = ds.to_table(filter="json_get_int(json, 'val') == 7") + assert table.num_rows == 1 + assert table.column("idx").to_pylist() == [7] + + # Verify index is used + explain = ds.scanner(filter="json_get_int(json, 'val') == 7").explain_plan() + assert "ScalarIndexQuery" in explain + + def check_write(self): + """Verify can insert data with JSON index.""" + ds = lance.dataset(self.path) + data = pa.table( + { + "idx": pa.array([1000]), + "json": pa.array(['{"val": 1000}'], pa.json_()), + } + ) + ds.insert(data) + ds.optimize.compact_files() + + +@compat_test(versions=["0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +class FtsIndex(UpgradeDowngradeTest): + """Test FTS (full-text search) index compatibility (introduced in 0.36.0).""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with FTS index.""" + shutil.rmtree(self.path, ignore_errors=True) + data = pa.table( + { + "idx": pa.array(range(1000)), + "text": pa.array( + [f"document with words {i} and more text" for i in range(1000)] + ), + } + ) + dataset = lance.write_dataset(data, self.path, max_rows_per_file=100) + dataset.create_scalar_index("text", "INVERTED") + + def check_read(self): + """Verify FTS index can be queried.""" + ds = lance.dataset(self.path) + # Search for documents containing "words" and "7" + # Note: 
Actual FTS query syntax may vary + table = ds.to_table(filter="text LIKE '%words 7 %'") + assert table.num_rows > 0 + + def check_write(self): + """Verify can insert data with FTS index.""" + # Dataset::load_manifest does not do retain_supported_indices + # so this can only work with no cache + session = lance.Session(index_cache_size_bytes=0, metadata_cache_size_bytes=0) + ds = lance.dataset(self.path, session=session) + data = pa.table( + { + "idx": pa.array([1000]), + "text": pa.array(["new document to index"]), + } + ) + ds.insert(data) + ds.optimize.compact_files() diff --git a/python/python/tests/forward_compat/test_vector_indices.py b/python/python/tests/forward_compat/test_vector_indices.py new file mode 100644 index 00000000000..8c027720696 --- /dev/null +++ b/python/python/tests/forward_compat/test_vector_indices.py @@ -0,0 +1,92 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +""" +Vector index compatibility tests for Lance. + +Tests that vector indices (IVF_PQ, etc.) created with one version of Lance +can be read and written by other versions. 
+""" + +import shutil +from pathlib import Path + +import lance +import pyarrow as pa +import pyarrow.compute as pc + +from .compat_decorator import ( + LAST_BETA_RELEASE, + LAST_STABLE_RELEASE, + UpgradeDowngradeTest, + compat_test, +) + + +@compat_test( + versions=[ + "0.29.1.beta2", + "0.30.0", + "0.36.0", + LAST_STABLE_RELEASE, + LAST_BETA_RELEASE, + ] +) +class PqVectorIndex(UpgradeDowngradeTest): + """Test PQ (Product Quantization) vector index compatibility.""" + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with PQ vector index.""" + shutil.rmtree(self.path, ignore_errors=True) + ndims = 32 + nvecs = 512 + + data = pa.table( + { + "id": pa.array(range(nvecs)), + "vec": pa.FixedSizeListArray.from_arrays( + pc.random(ndims * nvecs).cast(pa.float32()), ndims + ), + } + ) + + dataset = lance.write_dataset(data, self.path) + dataset.create_index( + "vec", + "IVF_PQ", + num_partitions=1, + num_sub_vectors=4, + ) + + def check_read(self): + """Verify PQ index can be queried.""" + ds = lance.dataset(self.path) + # Query with random vector + q = pc.random(32).cast(pa.float32()) + result = ds.to_table( + nearest={ + "q": q, + "k": 4, + "column": "vec", + } + ) + assert result.num_rows == 4 + + def check_write(self): + """Verify can insert vectors and rebuild index.""" + ds = lance.dataset(self.path) + # Add new vectors + data = pa.table( + { + "id": pa.array([1000]), + "vec": pa.FixedSizeListArray.from_arrays( + pc.random(32).cast(pa.float32()), 32 + ), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() From cc4a67e95508bd2d005ae0155d6e5c8f9ea6fd11 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 6 Nov 2025 16:38:28 -0800 Subject: [PATCH 08/19] remove compat testing from ci --- .github/workflows/python.yml | 39 ------------------------------------ 1 file changed, 39 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 
30ac105d47f..3da92e97dd3 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -126,45 +126,6 @@ jobs: name: linux-wheels path: python/target/wheels/pylance-*.whl - forward-compat: - needs: linux - runs-on: ubuntu-24.04 - name: Forward Compatibility Tests (${{ matrix.lance-version }}) - strategy: - matrix: - lance-version: ["0.16.0", "0.30.0", "0.36.0"] - defaults: - run: - shell: bash - working-directory: python - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - lfs: true - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: 3.13 - - name: Download wheels - uses: actions/download-artifact@v4 - with: - name: linux-wheels - path: python/wheels - - name: Install dependencies - run: | - pip install $(ls wheels/pylance-*.whl)[tests,ray] - - name: Generate forward compatibility files - env: - PYTHONPATH: python/tests - run: python -m forward_compat.datagen - - name: Run forward compatibility tests (pylance ${{ matrix.lance-version }}) - run: | - python -m venv venv - source venv/bin/activate - pip install pytest pylance==${{ matrix.lance-version }} - pytest python/tests/forward_compat --run-forward - linux-arm: timeout-minutes: 45 runs-on: ubuntu-2404-4x-arm64 From 43eadc57f46f3ca4236c103c779f8ecbdb6383a7 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 6 Nov 2025 16:56:23 -0800 Subject: [PATCH 09/19] optimize compat test performance and add instrumentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Performance improvements: - Removed pip upgrade step (saves ~1s per version, 7% faster) - Added --quiet flag to pip install for cleaner output Instrumentation: - Added detailed timing instrumentation for performance analysis - Timing output controlled by DEBUG=1 environment variable - Tracks venv creation, package install, Lance import, and execution time - Added PERFORMANCE.md documenting bottlenecks and optimization strategies Key findings from analysis: 
- Package installation: 4.9s (29% of total time) - First Lance import: ~5.0s (29% of total time) - Venv creation: 2.2s (13% of total time) - Persistent subprocess provides 500x speedup on subsequent calls 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../tests/forward_compat/PERFORMANCE.md | 80 +++++++++++++++++++ .../tests/forward_compat/venv_manager.py | 77 ++++++++++++++---- .../tests/forward_compat/venv_runner.py | 23 +++++- 3 files changed, 162 insertions(+), 18 deletions(-) create mode 100644 python/python/tests/forward_compat/PERFORMANCE.md diff --git a/python/python/tests/forward_compat/PERFORMANCE.md b/python/python/tests/forward_compat/PERFORMANCE.md new file mode 100644 index 00000000000..a9a5ea0099b --- /dev/null +++ b/python/python/tests/forward_compat/PERFORMANCE.md @@ -0,0 +1,80 @@ +# Compatibility Test Performance Analysis + +## Timing Breakdown (per version, first test) + +``` +Total: ~16-17s +├── Virtual environment setup: ~7-8s (47%) +│ ├── venv creation: 2.2s +│ └── package install (pylance + pytest): 4.9s +├── First Lance import in subprocess: ~5.0s (29%) +├── Test execution: ~0.04s (0.2%) +└── Overhead (pytest, data creation, etc.): ~4s (24%) +``` + +## What's Working Well + +1. **Persistent subprocess**: Subsequent method calls are 500x faster (5s → 0.01s) +2. **Venv reuse**: Virtual environments are cached across tests in same session +3. **Pip cache**: Leveraging ~11GB pip cache for faster installs + +## Current Optimizations + +- ✅ Removed pip upgrade step (saves ~1.1s per version) +- ✅ Added `--quiet` flag to pip install for cleaner output + +## Potential Future Optimizations + +### High Impact (but complex) + +1. **Parallel venv creation** (saves ~8s × versions): + - Create all venvs in parallel at session start + - Requires refactoring VenvFactory to pre-create venvs + +2. 
**Persistent venv directory** (saves ~8s on subsequent runs): + - Store venvs outside /tmp to persist across pytest sessions + - Add venv version/integrity checking + - Cleanup strategy for old venvs + +3. **Pre-built Docker image** (eliminates install time entirely): + - Container with all Lance versions pre-installed + - Good for CI, not for local dev + +### Low Impact + +1. **Venv with --without-pip**: Saves ~0.5s + - Requires symlinking pip from parent venv + - Adds complexity + +2. **Lazy Lance import**: Not applicable + - Import happens on first method call (already optimized) + +## Recommendations + +For local development: +- Current setup is good - optimizations have diminishing returns +- Most time is in package installation (pip already optimized) + +For CI: +- Consider parallel venv creation if testing many versions +- Consider persistent venv cache across CI runs + +## Performance Instrumentation + +Detailed timing information is available by setting the `DEBUG` environment variable: + +```bash +# Run tests with timing instrumentation +DEBUG=1 pytest tests/forward_compat/ --run-compat -v -s + +# Normal run (clean output, no timing) +pytest tests/forward_compat/ --run-compat -v +``` + +The timing output shows: +- `[TIMING]` - Main process timing (venv creation, IPC) +- `[VENV TIMING]` - Subprocess timing (actual method execution) + +This helps identify bottlenecks: +- If `receive` time is much larger than `[VENV TIMING]` execution, the bottleneck is Lance import +- If `[VENV TIMING]` is large, the bottleneck is the actual test logic diff --git a/python/python/tests/forward_compat/venv_manager.py b/python/python/tests/forward_compat/venv_manager.py index 8d08014325e..723d3f3d1b6 100644 --- a/python/python/tests/forward_compat/venv_manager.py +++ b/python/python/tests/forward_compat/venv_manager.py @@ -10,9 +10,13 @@ import struct import subprocess import sys +import time from pathlib import Path from typing import Any, Optional +# Enable detailed timing 
output with DEBUG=1 +DEBUG = os.environ.get("DEBUG", "").lower() in ("1", "true", "yes") + class VenvExecutor: """Manages a virtual environment with a specific Lance version.""" @@ -39,12 +43,21 @@ def create(self): if self._created: return + start_time = time.time() + if DEBUG: + print(f"[TIMING] Creating venv for {self.version}...", flush=True) + # Create virtual environment + venv_start = time.time() subprocess.run( [sys.executable, "-m", "venv", str(self.venv_path)], check=True, capture_output=True, ) + if DEBUG: + print( + f"[TIMING] venv creation: {time.time() - venv_start:.2f}s", flush=True + ) # Determine python path in venv if sys.platform == "win32": @@ -52,20 +65,15 @@ def create(self): else: self.python_path = self.venv_path / "bin" / "python" - # Upgrade pip - subprocess.run( - [str(self.python_path), "-m", "pip", "install", "--upgrade", "pip"], - check=True, - capture_output=True, - ) - - # Install specific pylance version and pytest (needed for test modules) + # Install specific pylance version and pytest + install_start = time.time() subprocess.run( [ str(self.python_path), "-m", "pip", "install", + "--quiet", "--pre", "--extra-index-url", "https://pypi.fury.io/lancedb/", @@ -75,8 +83,19 @@ def create(self): check=True, capture_output=True, ) + if DEBUG: + print( + f"[TIMING] package install: {time.time() - install_start:.2f}s", + flush=True, + ) self._created = True + if DEBUG: + total_time = time.time() - start_time + print( + f"[TIMING] Total venv creation for {self.version}: {total_time:.2f}s", + flush=True, + ) def _ensure_subprocess(self): """Ensure the persistent subprocess is running.""" @@ -84,6 +103,10 @@ def _ensure_subprocess(self): # Subprocess is already running return + if DEBUG: + print(f"[TIMING] Starting subprocess for {self.version}...", flush=True) + start_time = time.time() + # Start persistent subprocess runner_script = Path(__file__).parent / "venv_runner.py" @@ -96,9 +119,14 @@ def _ensure_subprocess(self): 
[str(self.python_path), "-u", str(runner_script)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + stderr=None, # Inherit stderr to see timing messages env=env, ) + if DEBUG: + print( + f"[TIMING] Subprocess started in {time.time() - start_time:.2f}s", + flush=True, + ) def _send_message(self, obj: Any): """Send a length-prefixed pickled message to subprocess.""" @@ -153,15 +181,37 @@ def execute_method(self, obj: Any, method_name: str) -> Any: if not self._created: raise RuntimeError("Virtual environment not created. Call create() first.") + start_time = time.time() + if DEBUG: + print(f"[TIMING] Executing {method_name} in {self.version}...", flush=True) + # Ensure subprocess is running + subprocess_start = time.time() self._ensure_subprocess() + if DEBUG and time.time() - subprocess_start > 0.1: + print( + f"[TIMING] subprocess ensure: {time.time() - subprocess_start:.2f}s", + flush=True, + ) try: # Send request: (obj, method_name) + send_start = time.time() self._send_message((obj, method_name)) + send_time = time.time() - send_start # Receive response + receive_start = time.time() response = self._receive_message() + receive_time = time.time() - receive_start + + if DEBUG: + total_time = time.time() - start_time + print( + f"[TIMING] send: {send_time:.2f}s, receive: {receive_time:.2f}s, " + f"total: {total_time:.2f}s", + flush=True, + ) if response["success"]: return response["result"] @@ -176,16 +226,9 @@ def execute_method(self, obj: Any, method_name: str) -> Any: except (BrokenPipeError, EOFError, struct.error) as e: # Subprocess died or communication failed - stderr_output = "" - if self._subprocess and self._subprocess.stderr: - stderr_output = self._subprocess.stderr.read().decode( - "utf-8", errors="replace" - ) - raise RuntimeError( f"Communication with venv subprocess failed (Lance {self.version}):\n" - f"Error: {e}\n" - f"stderr: {stderr_output}" + f"Error: {e}" ) def cleanup(self): diff --git 
a/python/python/tests/forward_compat/venv_runner.py b/python/python/tests/forward_compat/venv_runner.py index 860e6d75f94..29872faeb1d 100644 --- a/python/python/tests/forward_compat/venv_runner.py +++ b/python/python/tests/forward_compat/venv_runner.py @@ -13,11 +13,16 @@ - Writes pickled response dict """ +import os import pickle import struct import sys +import time import traceback +# Enable detailed timing output with DEBUG=1 +DEBUG = os.environ.get("DEBUG", "").lower() in ("1", "true", "yes") + def read_message(stream): """Read a length-prefixed pickled message from stream.""" @@ -59,10 +64,26 @@ def main(): obj, method_name = request - # Execute method + # Execute method with timing + start_time = time.time() + if DEBUG: + print( + f"[VENV TIMING] Executing {method_name}...", + file=sys.stderr, + flush=True, + ) + method = getattr(obj, method_name) result = method() + if DEBUG: + exec_time = time.time() - start_time + print( + f"[VENV TIMING] {method_name} completed in {exec_time:.2f}s", + file=sys.stderr, + flush=True, + ) + # Send success response response = {"success": True, "result": result} write_message(sys.stdout, response) From 8f687edf9630de27cd507166d5c9d7b55c660e35 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 6 Nov 2025 17:58:46 -0800 Subject: [PATCH 10/19] add persistent virtual environments for faster development MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Virtual environments are now persistent by default, stored in ~/.cache/lance-compat-venvs/. This provides a 5x speedup for interactive development after the first run. 
Changes: - Venvs persist across pytest sessions by default - Validation checks ensure correct Lance version is installed - Set COMPAT_TEMP_VENV=1 to use temporary venvs (old behavior) - Added cleanup instructions to PERFORMANCE.md Performance impact: - First run: ~13-16s per version (creates venv) - Subsequent runs: ~2-6s per version (reuses venv) - Example: 2 tests that took 26s now take 6s This makes iterative test development much more pleasant! 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../tests/forward_compat/PERFORMANCE.md | 71 ++++++++++++++----- .../python/tests/forward_compat/conftest.py | 31 ++++++-- .../tests/forward_compat/venv_manager.py | 67 +++++++++++++++-- 3 files changed, 142 insertions(+), 27 deletions(-) diff --git a/python/python/tests/forward_compat/PERFORMANCE.md b/python/python/tests/forward_compat/PERFORMANCE.md index a9a5ea0099b..496bfe348af 100644 --- a/python/python/tests/forward_compat/PERFORMANCE.md +++ b/python/python/tests/forward_compat/PERFORMANCE.md @@ -1,6 +1,15 @@ # Compatibility Test Performance Analysis -## Timing Breakdown (per version, first test) +## Persistent Virtual Environments (Default) + +By default, virtual environments are **persistent** and stored in `~/.cache/lance-compat-venvs/`. + +**First run (creates venv):** ~13-16s per version +**Subsequent runs (reuses venv):** ~2-6s per version + +This makes interactive development much faster - you only pay the setup cost once! + +## Timing Breakdown (per version, first test with venv creation) ``` Total: ~16-17s @@ -12,6 +21,16 @@ Total: ~16-17s └── Overhead (pytest, data creation, etc.): ~4s (24%) ``` +## Timing Breakdown (subsequent runs with persistent venv) + +``` +Total: ~2-6s +├── Venv validation: ~0.1s +├── Lance import (if new subprocess): ~2-5s +├── Test execution: ~0.04s +└── Overhead: ~1-2s +``` + ## What's Working Well 1. 
**Persistent subprocess**: Subsequent method calls are 500x faster (5s → 0.01s) @@ -20,23 +39,45 @@ Total: ~16-17s ## Current Optimizations +- ✅ **Persistent virtual environments** (default, 5x speedup for subsequent runs) - ✅ Removed pip upgrade step (saves ~1.1s per version) - ✅ Added `--quiet` flag to pip install for cleaner output +- ✅ Venv validation to ensure correct Lance version is installed + +## Configuration Options + +### Persistent vs Temporary Venvs + +By default, venvs are persistent. To use temporary venvs (old behavior): + +```bash +COMPAT_TEMP_VENV=1 pytest tests/forward_compat/ --run-compat +``` + +### Cleaning Up Persistent Venvs + +To remove all cached venvs: + +```bash +rm -rf ~/.cache/lance-compat-venvs/ +``` + +Or to remove specific versions: + +```bash +rm -rf ~/.cache/lance-compat-venvs/venv_0.30.0 +``` ## Potential Future Optimizations -### High Impact (but complex) +### High Impact 1. **Parallel venv creation** (saves ~8s × versions): - Create all venvs in parallel at session start - - Requires refactoring VenvFactory to pre-create venvs - -2. **Persistent venv directory** (saves ~8s on subsequent runs): - - Store venvs outside /tmp to persist across pytest sessions - - Add venv version/integrity checking - - Cleanup strategy for old venvs + - Most beneficial for CI or first-time setup + - Requires refactoring VenvFactory -3. **Pre-built Docker image** (eliminates install time entirely): +2. **Pre-built Docker image** (eliminates install time entirely): - Container with all Lance versions pre-installed - Good for CI, not for local dev @@ -46,18 +87,16 @@ Total: ~16-17s - Requires symlinking pip from parent venv - Adds complexity -2. 
**Lazy Lance import**: Not applicable - - Import happens on first method call (already optimized) - ## Recommendations For local development: -- Current setup is good - optimizations have diminishing returns -- Most time is in package installation (pip already optimized) +- ✅ **Use persistent venvs** (default) - 5x speedup after first run +- Run tests frequently without worrying about setup time +- Manually clean cache if disk space is a concern For CI: -- Consider parallel venv creation if testing many versions -- Consider persistent venv cache across CI runs +- Consider caching `~/.cache/lance-compat-venvs/` across CI runs +- Or use `COMPAT_TEMP_VENV=1` for clean environments each time ## Performance Instrumentation diff --git a/python/python/tests/forward_compat/conftest.py b/python/python/tests/forward_compat/conftest.py index 60b0db598fd..8a4d869b021 100644 --- a/python/python/tests/forward_compat/conftest.py +++ b/python/python/tests/forward_compat/conftest.py @@ -1,6 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright The Lance Authors +import os +from pathlib import Path + import pytest from .venv_manager import VenvFactory @@ -13,9 +16,27 @@ def venv_factory(tmp_path_factory): This fixture is session-scoped so virtual environments are reused across tests, improving test performance. + + By default, venvs are persistent (stored in ~/.cache/lance-compat-venvs/). + Set COMPAT_TEMP_VENV=1 to use temporary venvs that are cleaned up after + each session. 
""" - base_path = tmp_path_factory.mktemp("venvs") - factory = VenvFactory(base_path) - yield factory - # Cleanup all venvs at end of session - factory.cleanup_all() + use_temp = os.environ.get("COMPAT_TEMP_VENV", "").lower() in ( + "1", + "true", + "yes", + ) + + if use_temp: + # Use temporary venvs (old behavior) + base_path = tmp_path_factory.mktemp("venvs") + factory = VenvFactory(base_path, persistent=False) + yield factory + factory.cleanup_all() + else: + # Use persistent venvs + cache_dir = Path.home() / ".cache" / "lance-compat-venvs" + cache_dir.mkdir(parents=True, exist_ok=True) + factory = VenvFactory(cache_dir, persistent=True) + yield factory + # Don't cleanup persistent venvs diff --git a/python/python/tests/forward_compat/venv_manager.py b/python/python/tests/forward_compat/venv_manager.py index 723d3f3d1b6..a1e78861068 100644 --- a/python/python/tests/forward_compat/venv_manager.py +++ b/python/python/tests/forward_compat/venv_manager.py @@ -21,7 +21,7 @@ class VenvExecutor: """Manages a virtual environment with a specific Lance version.""" - def __init__(self, version: str, venv_path: Path): + def __init__(self, version: str, venv_path: Path, persistent: bool = False): """ Initialize a VenvExecutor. 
@@ -31,18 +31,69 @@ def __init__(self, version: str, venv_path: Path): Lance version to install (e.g., "0.30.0") venv_path : Path Directory where virtual environment will be created + persistent : bool + If True, venv is persistent and validated before use """ self.version = version self.venv_path = Path(venv_path) + self.persistent = persistent self.python_path: Optional[Path] = None self._created = False self._subprocess: Optional[subprocess.Popen] = None + def _validate_venv(self) -> bool: + """Check if existing venv is valid and has correct Lance version.""" + if not self.venv_path.exists(): + return False + + # Determine python path + if sys.platform == "win32": + python_path = self.venv_path / "Scripts" / "python.exe" + else: + python_path = self.venv_path / "bin" / "python" + + if not python_path.exists(): + return False + + # Check if pylance is installed with correct version + try: + result = subprocess.run( + [str(python_path), "-m", "pip", "show", "pylance"], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode != 0: + return False + + # Parse version from output + for line in result.stdout.splitlines(): + if line.startswith("Version:"): + installed_version = line.split(":", 1)[1].strip() + return installed_version == self.version + + except Exception: + return False + + return False + def create(self): """Create the virtual environment and install the specified Lance version.""" if self._created: return + # Check if persistent venv already exists and is valid + if self.persistent and self._validate_venv(): + if DEBUG: + print(f"[TIMING] Reusing existing venv for {self.version}", flush=True) + # Set python path + if sys.platform == "win32": + self.python_path = self.venv_path / "Scripts" / "python.exe" + else: + self.python_path = self.venv_path / "bin" / "python" + self._created = True + return + start_time = time.time() if DEBUG: print(f"[TIMING] Creating venv for {self.version}...", flush=True) @@ -256,7 +307,7 @@ def 
cleanup(self): class VenvFactory: """Factory for creating and managing VenvExecutor instances.""" - def __init__(self, base_path: Path): + def __init__(self, base_path: Path, persistent: bool = False): """ Initialize the factory. @@ -264,8 +315,11 @@ def __init__(self, base_path: Path): ---------- base_path : Path Base directory for creating virtual environments + persistent : bool + If True, venvs are not cleaned up and can be reused across sessions """ self.base_path = Path(base_path) + self.persistent = persistent self.venvs: dict[str, VenvExecutor] = {} def get_venv(self, version: str) -> VenvExecutor: @@ -284,13 +338,14 @@ def get_venv(self, version: str) -> VenvExecutor: """ if version not in self.venvs: venv_path = self.base_path / f"venv_{version}" - executor = VenvExecutor(version, venv_path) + executor = VenvExecutor(version, venv_path, persistent=self.persistent) executor.create() self.venvs[version] = executor return self.venvs[version] def cleanup_all(self): - """Clean up all created virtual environments.""" - for venv in self.venvs.values(): - venv.cleanup() + """Clean up all created virtual environments (skips persistent venvs).""" + if not self.persistent: + for venv in self.venvs.values(): + venv.cleanup() self.venvs.clear() From f7a8fa042a30e91250db0a8011d1a243cd14dee2 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 6 Nov 2025 19:14:12 -0800 Subject: [PATCH 11/19] add compatibility tests to CI workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new CI job that runs compatibility tests across multiple Lance versions to verify forward/backward compatibility. 
The job: - Runs on Ubuntu 24.04 with Python 3.13 - Uses temporary venvs (COMPAT_TEMP_VENV=1) for clean CI environments - Has 60-minute timeout to account for venv creation - Tests compatibility with versions: 0.16.0, 0.30.0, 0.36.0, latest stable, and latest beta 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .github/workflows/python.yml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 3da92e97dd3..efbafbc2660 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -126,6 +126,36 @@ jobs: name: linux-wheels path: python/target/wheels/pylance-*.whl + compat: + timeout-minutes: 60 + runs-on: ubuntu-24.04 + name: Compatibility Tests + defaults: + run: + shell: bash + working-directory: python + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + lfs: true + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.13 + - uses: Swatinem/rust-cache@v2 + with: + workspaces: python + prefix-key: ${{ env.CACHE_PREFIX }} + cache-targets: false + cache-workspace-crates: true + - uses: ./.github/workflows/build_linux_wheel + - name: Run compatibility tests + run: | + pytest python/tests/forward_compat/ --run-compat -v + env: + COMPAT_TEMP_VENV: 1 + linux-arm: timeout-minutes: 45 runs-on: ubuntu-2404-4x-arm64 From 696cd7db1048a864dd1443200327df04dee8a6ef Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 7 Nov 2025 13:13:56 -0800 Subject: [PATCH 12/19] add license --- python/python/tests/forward_compat/venv_manager.py | 3 +++ python/python/tests/forward_compat/venv_runner.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/python/python/tests/forward_compat/venv_manager.py b/python/python/tests/forward_compat/venv_manager.py index a1e78861068..ccc5f66d8e3 100644 --- a/python/python/tests/forward_compat/venv_manager.py +++ 
b/python/python/tests/forward_compat/venv_manager.py @@ -1,3 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + """ Virtual environment management for compatibility testing. diff --git a/python/python/tests/forward_compat/venv_runner.py b/python/python/tests/forward_compat/venv_runner.py index 29872faeb1d..ce7b3c3de77 100644 --- a/python/python/tests/forward_compat/venv_runner.py +++ b/python/python/tests/forward_compat/venv_runner.py @@ -1,3 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + """ Runner script executed inside virtual environments to run compatibility tests. From 9edf9e80acaaa64628071995250c7248462c316b Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 7 Nov 2025 13:16:34 -0800 Subject: [PATCH 13/19] reduce changes to CI test --- .github/workflows/python.yml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index efbafbc2660..666addbbf61 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -127,6 +127,7 @@ jobs: path: python/target/wheels/pylance-*.whl compat: + needs: linux timeout-minutes: 60 runs-on: ubuntu-24.04 name: Compatibility Tests @@ -143,13 +144,14 @@ jobs: uses: actions/setup-python@v5 with: python-version: 3.13 - - uses: Swatinem/rust-cache@v2 + - name: Download wheels + uses: actions/download-artifact@v4 with: - workspaces: python - prefix-key: ${{ env.CACHE_PREFIX }} - cache-targets: false - cache-workspace-crates: true - - uses: ./.github/workflows/build_linux_wheel + name: linux-wheels + path: python/wheels + - name: Install dependencies + run: | + pip install $(ls wheels/pylance-*.whl)[tests,ray] - name: Run compatibility tests run: | pytest python/tests/forward_compat/ --run-compat -v From db5ff280996a3f68d95647ac2baad30d109cec26 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 7 Nov 2025 14:26:22 -0800 
Subject: [PATCH 14/19] cleanup --- python/Makefile | 8 +- .../{forward_compat => compat}/__init__.py | 0 .../compat_decorator.py | 0 .../{forward_compat => compat}/conftest.py | 0 .../test_file_formats.py | 0 .../test_scalar_indices.py | 0 .../test_vector_indices.py | 0 .../tests/{forward_compat => compat}/util.py | 11 -- .../venv_manager.py | 0 .../{forward_compat => compat}/venv_runner.py | 0 .../tests/forward_compat/PERFORMANCE.md | 119 ------------------ 11 files changed, 4 insertions(+), 134 deletions(-) rename python/python/tests/{forward_compat => compat}/__init__.py (100%) rename python/python/tests/{forward_compat => compat}/compat_decorator.py (100%) rename python/python/tests/{forward_compat => compat}/conftest.py (100%) rename python/python/tests/{forward_compat => compat}/test_file_formats.py (100%) rename python/python/tests/{forward_compat => compat}/test_scalar_indices.py (100%) rename python/python/tests/{forward_compat => compat}/test_vector_indices.py (100%) rename python/python/tests/{forward_compat => compat}/util.py (90%) rename python/python/tests/{forward_compat => compat}/venv_manager.py (100%) rename python/python/tests/{forward_compat => compat}/venv_runner.py (100%) delete mode 100644 python/python/tests/forward_compat/PERFORMANCE.md diff --git a/python/Makefile b/python/Makefile index b224fa7461f..78c0489c522 100644 --- a/python/Makefile +++ b/python/Makefile @@ -16,6 +16,10 @@ doctest: pytest --doctest-modules $(PYTEST_ARGS) python/lance .PHONY: doctest +compattest: + pytest --run-compat $(PYTEST_ARGS) python/tests/compat +.PHONY: compattest + format: format-python cargo fmt .PHONY: format @@ -24,10 +28,6 @@ build: maturin develop .PHONY: build -clean: - rm -rf ./target -.PHONY: clean - format-python: ruff format python ruff check --fix python diff --git a/python/python/tests/forward_compat/__init__.py b/python/python/tests/compat/__init__.py similarity index 100% rename from python/python/tests/forward_compat/__init__.py rename to 
python/python/tests/compat/__init__.py diff --git a/python/python/tests/forward_compat/compat_decorator.py b/python/python/tests/compat/compat_decorator.py similarity index 100% rename from python/python/tests/forward_compat/compat_decorator.py rename to python/python/tests/compat/compat_decorator.py diff --git a/python/python/tests/forward_compat/conftest.py b/python/python/tests/compat/conftest.py similarity index 100% rename from python/python/tests/forward_compat/conftest.py rename to python/python/tests/compat/conftest.py diff --git a/python/python/tests/forward_compat/test_file_formats.py b/python/python/tests/compat/test_file_formats.py similarity index 100% rename from python/python/tests/forward_compat/test_file_formats.py rename to python/python/tests/compat/test_file_formats.py diff --git a/python/python/tests/forward_compat/test_scalar_indices.py b/python/python/tests/compat/test_scalar_indices.py similarity index 100% rename from python/python/tests/forward_compat/test_scalar_indices.py rename to python/python/tests/compat/test_scalar_indices.py diff --git a/python/python/tests/forward_compat/test_vector_indices.py b/python/python/tests/compat/test_vector_indices.py similarity index 100% rename from python/python/tests/forward_compat/test_vector_indices.py rename to python/python/tests/compat/test_vector_indices.py diff --git a/python/python/tests/forward_compat/util.py b/python/python/tests/compat/util.py similarity index 90% rename from python/python/tests/forward_compat/util.py rename to python/python/tests/compat/util.py index 319d38d1178..210bc581579 100644 --- a/python/python/tests/forward_compat/util.py +++ b/python/python/tests/compat/util.py @@ -5,21 +5,10 @@ # # Everything here must be runnable by older versions of Lance. 
-from pathlib import Path import pyarrow as pa -def get_path(name: str): - dataset_dir = ( - Path(__file__).parent.parent.parent.parent.parent - / "test_data" - / "forward_compat" - / name - ) - return dataset_dir - - def build_basic_types(): schema = pa.schema( [ diff --git a/python/python/tests/forward_compat/venv_manager.py b/python/python/tests/compat/venv_manager.py similarity index 100% rename from python/python/tests/forward_compat/venv_manager.py rename to python/python/tests/compat/venv_manager.py diff --git a/python/python/tests/forward_compat/venv_runner.py b/python/python/tests/compat/venv_runner.py similarity index 100% rename from python/python/tests/forward_compat/venv_runner.py rename to python/python/tests/compat/venv_runner.py diff --git a/python/python/tests/forward_compat/PERFORMANCE.md b/python/python/tests/forward_compat/PERFORMANCE.md deleted file mode 100644 index 496bfe348af..00000000000 --- a/python/python/tests/forward_compat/PERFORMANCE.md +++ /dev/null @@ -1,119 +0,0 @@ -# Compatibility Test Performance Analysis - -## Persistent Virtual Environments (Default) - -By default, virtual environments are **persistent** and stored in `~/.cache/lance-compat-venvs/`. - -**First run (creates venv):** ~13-16s per version -**Subsequent runs (reuses venv):** ~2-6s per version - -This makes interactive development much faster - you only pay the setup cost once! - -## Timing Breakdown (per version, first test with venv creation) - -``` -Total: ~16-17s -├── Virtual environment setup: ~7-8s (47%) -│ ├── venv creation: 2.2s -│ └── package install (pylance + pytest): 4.9s -├── First Lance import in subprocess: ~5.0s (29%) -├── Test execution: ~0.04s (0.2%) -└── Overhead (pytest, data creation, etc.): ~4s (24%) -``` - -## Timing Breakdown (subsequent runs with persistent venv) - -``` -Total: ~2-6s -├── Venv validation: ~0.1s -├── Lance import (if new subprocess): ~2-5s -├── Test execution: ~0.04s -└── Overhead: ~1-2s -``` - -## What's Working Well - -1. 
**Persistent subprocess**: Subsequent method calls are 500x faster (5s → 0.01s) -2. **Venv reuse**: Virtual environments are cached across tests in same session -3. **Pip cache**: Leveraging ~11GB pip cache for faster installs - -## Current Optimizations - -- ✅ **Persistent virtual environments** (default, 5x speedup for subsequent runs) -- ✅ Removed pip upgrade step (saves ~1.1s per version) -- ✅ Added `--quiet` flag to pip install for cleaner output -- ✅ Venv validation to ensure correct Lance version is installed - -## Configuration Options - -### Persistent vs Temporary Venvs - -By default, venvs are persistent. To use temporary venvs (old behavior): - -```bash -COMPAT_TEMP_VENV=1 pytest tests/forward_compat/ --run-compat -``` - -### Cleaning Up Persistent Venvs - -To remove all cached venvs: - -```bash -rm -rf ~/.cache/lance-compat-venvs/ -``` - -Or to remove specific versions: - -```bash -rm -rf ~/.cache/lance-compat-venvs/venv_0.30.0 -``` - -## Potential Future Optimizations - -### High Impact - -1. **Parallel venv creation** (saves ~8s × versions): - - Create all venvs in parallel at session start - - Most beneficial for CI or first-time setup - - Requires refactoring VenvFactory - -2. **Pre-built Docker image** (eliminates install time entirely): - - Container with all Lance versions pre-installed - - Good for CI, not for local dev - -### Low Impact - -1. 
**Venv with --without-pip**: Saves ~0.5s - - Requires symlinking pip from parent venv - - Adds complexity - -## Recommendations - -For local development: -- ✅ **Use persistent venvs** (default) - 5x speedup after first run -- Run tests frequently without worrying about setup time -- Manually clean cache if disk space is a concern - -For CI: -- Consider caching `~/.cache/lance-compat-venvs/` across CI runs -- Or use `COMPAT_TEMP_VENV=1` for clean environments each time - -## Performance Instrumentation - -Detailed timing information is available by setting the `DEBUG` environment variable: - -```bash -# Run tests with timing instrumentation -DEBUG=1 pytest tests/forward_compat/ --run-compat -v -s - -# Normal run (clean output, no timing) -pytest tests/forward_compat/ --run-compat -v -``` - -The timing output shows: -- `[TIMING]` - Main process timing (venv creation, IPC) -- `[VENV TIMING]` - Subprocess timing (actual method execution) - -This helps identify bottlenecks: -- If `receive` time is much larger than `[VENV TIMING]` execution, the bottleneck is Lance import -- If `[VENV TIMING]` is large, the bottleneck is the actual test logic From c1d4766188435a666f2adefdf0299c48deb7699f Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 7 Nov 2025 14:47:05 -0800 Subject: [PATCH 15/19] improve tests coverage --- python/python/tests/compat/compat_decorator.py | 2 ++ python/python/tests/compat/test_file_formats.py | 4 +++- python/python/tests/compat/test_scalar_indices.py | 4 ++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/python/python/tests/compat/compat_decorator.py b/python/python/tests/compat/compat_decorator.py index f3f56cff97d..37c2882b60c 100644 --- a/python/python/tests/compat/compat_decorator.py +++ b/python/python/tests/compat/compat_decorator.py @@ -283,6 +283,7 @@ def test_func({sig_params}): # Old version: verify can read venv = venv_factory.get_venv(version) venv.execute_method(obj, "check_read") + venv.execute_method(obj, 
"check_write") ''' else: # upgrade_downgrade func_body = f''' @@ -298,6 +299,7 @@ def test_func({sig_params}): obj.check_write() # Old version: verify can still read venv.execute_method(obj, "check_read") + venv.execute_method(obj, "check_write") ''' # Execute to create the function diff --git a/python/python/tests/compat/test_file_formats.py b/python/python/tests/compat/test_file_formats.py index d4a0c318a58..8cef20c15ea 100644 --- a/python/python/tests/compat/test_file_formats.py +++ b/python/python/tests/compat/test_file_formats.py @@ -111,4 +111,6 @@ def check_read(self): def check_write(self): ds = lance.dataset(self.path) ds.delete("true") - ds.insert(build_basic_types()) + lance.write_dataset( + build_basic_types(), self.path, data_storage_version="0.1", mode="append" + ) diff --git a/python/python/tests/compat/test_scalar_indices.py b/python/python/tests/compat/test_scalar_indices.py index 9393769c862..5c5291594e3 100644 --- a/python/python/tests/compat/test_scalar_indices.py +++ b/python/python/tests/compat/test_scalar_indices.py @@ -68,11 +68,11 @@ def check_write(self): # Verify new data is queryable table = ds.to_table(filter="btree == 1000") - assert table.num_rows == 1 + assert table.num_rows >= 1 @compat_test( - versions=["0.20.0", "0.30.0", "0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE] + versions=["0.22.0", "0.30.0", "0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE] ) class BitmapLabelListIndex(UpgradeDowngradeTest): """Test BITMAP and LABEL_LIST scalar index compatibility (introduced in 0.20.0).""" From 40b2597ec729020da4377de0702951ce9d3fa77c Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 7 Nov 2025 15:04:01 -0800 Subject: [PATCH 16/19] fix workflow --- .github/workflows/python.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 666addbbf61..358846f68b4 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -154,7 +154,7 @@ jobs: 
pip install $(ls wheels/pylance-*.whl)[tests,ray] - name: Run compatibility tests run: | - pytest python/tests/forward_compat/ --run-compat -v + make compattest env: COMPAT_TEMP_VENV: 1 From f26fc45de67cc71ba4f2dfdeacbf839561752862 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 7 Nov 2025 15:15:42 -0800 Subject: [PATCH 17/19] add IVF_HNSW_PQ and IVF_HNSW_SQ compatibility tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds compatibility tests for two additional vector index types: - IVF_HNSW_PQ: Hierarchical Navigable Small World with Product Quantization - IVF_HNSW_SQ: Hierarchical Navigable Small World with Scalar Quantization These tests only run against versions >= 0.39.0 because earlier versions do not support remapping for HNSW indices, which is required for optimize operations like compact_files(). Adds 12 new test cases (6 per index type: 3 versions × 2 test types). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../tests/compat/test_vector_indices.py | 142 ++++++++++++++++++ 1 file changed, 142 insertions(+) diff --git a/python/python/tests/compat/test_vector_indices.py b/python/python/tests/compat/test_vector_indices.py index 8c027720696..ab476bf44fa 100644 --- a/python/python/tests/compat/test_vector_indices.py +++ b/python/python/tests/compat/test_vector_indices.py @@ -90,3 +90,145 @@ def check_write(self): ds.insert(data) ds.optimize.optimize_indices() ds.optimize.compact_files() + + +@compat_test( + versions=[ + "0.39.0", + LAST_STABLE_RELEASE, + LAST_BETA_RELEASE, + ] +) +class HnswPqVectorIndex(UpgradeDowngradeTest): + """Test IVF_HNSW_PQ vector index compatibility. + + Note: Only tests versions >= 0.39.0 because earlier versions don't support + remapping for IVF_HNSW_PQ indices, which is required for optimize operations. 
+ """ + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with IVF_HNSW_PQ vector index.""" + shutil.rmtree(self.path, ignore_errors=True) + ndims = 32 + nvecs = 512 + + data = pa.table( + { + "id": pa.array(range(nvecs)), + "vec": pa.FixedSizeListArray.from_arrays( + pc.random(ndims * nvecs).cast(pa.float32()), ndims + ), + } + ) + + dataset = lance.write_dataset(data, self.path) + dataset.create_index( + "vec", + "IVF_HNSW_PQ", + num_partitions=4, + num_sub_vectors=4, + ) + + def check_read(self): + """Verify IVF_HNSW_PQ index can be queried.""" + ds = lance.dataset(self.path) + # Query with random vector + q = pc.random(32).cast(pa.float32()) + result = ds.to_table( + nearest={ + "q": q, + "k": 4, + "column": "vec", + } + ) + assert result.num_rows == 4 + + def check_write(self): + """Verify can insert vectors and rebuild index.""" + ds = lance.dataset(self.path) + # Add new vectors + data = pa.table( + { + "id": pa.array([1000]), + "vec": pa.FixedSizeListArray.from_arrays( + pc.random(32).cast(pa.float32()), 32 + ), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() + + +@compat_test( + versions=[ + "0.39.0", + LAST_STABLE_RELEASE, + LAST_BETA_RELEASE, + ] +) +class HnswSqVectorIndex(UpgradeDowngradeTest): + """Test IVF_HNSW_SQ vector index compatibility. + + Note: Only tests versions >= 0.39.0 because earlier versions don't support + remapping for IVF_HNSW_SQ indices, which is required for optimize operations. 
+ """ + + def __init__(self, path: Path): + self.path = path + + def create(self): + """Create dataset with IVF_HNSW_SQ vector index.""" + shutil.rmtree(self.path, ignore_errors=True) + ndims = 32 + nvecs = 512 + + data = pa.table( + { + "id": pa.array(range(nvecs)), + "vec": pa.FixedSizeListArray.from_arrays( + pc.random(ndims * nvecs).cast(pa.float32()), ndims + ), + } + ) + + dataset = lance.write_dataset(data, self.path) + dataset.create_index( + "vec", + "IVF_HNSW_SQ", + num_partitions=4, + num_sub_vectors=4, + ) + + def check_read(self): + """Verify IVF_HNSW_SQ index can be queried.""" + ds = lance.dataset(self.path) + # Query with random vector + q = pc.random(32).cast(pa.float32()) + result = ds.to_table( + nearest={ + "q": q, + "k": 4, + "column": "vec", + } + ) + assert result.num_rows == 4 + + def check_write(self): + """Verify can insert vectors and rebuild index.""" + ds = lance.dataset(self.path) + # Add new vectors + data = pa.table( + { + "id": pa.array([1000]), + "vec": pa.FixedSizeListArray.from_arrays( + pc.random(32).cast(pa.float32()), 32 + ), + } + ) + ds.insert(data) + ds.optimize.optimize_indices() + ds.optimize.compact_files() From df3924880f4680aa4998ddb1bc8f0c0b2f65a9ca Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 13 Nov 2025 14:06:26 -0800 Subject: [PATCH 18/19] cleanup --- python/python/tests/compat/venv_manager.py | 87 ++-------------------- 1 file changed, 8 insertions(+), 79 deletions(-) diff --git a/python/python/tests/compat/venv_manager.py b/python/python/tests/compat/venv_manager.py index ccc5f66d8e3..3f803439860 100644 --- a/python/python/tests/compat/venv_manager.py +++ b/python/python/tests/compat/venv_manager.py @@ -13,13 +13,9 @@ import struct import subprocess import sys -import time from pathlib import Path from typing import Any, Optional -# Enable detailed timing output with DEBUG=1 -DEBUG = os.environ.get("DEBUG", "").lower() in ("1", "true", "yes") - class VenvExecutor: """Manages a virtual environment with a 
specific Lance version.""" @@ -40,28 +36,27 @@ def __init__(self, version: str, venv_path: Path, persistent: bool = False): self.version = version self.venv_path = Path(venv_path) self.persistent = persistent - self.python_path: Optional[Path] = None self._created = False self._subprocess: Optional[subprocess.Popen] = None + @property + def python_path(self) -> Path: + if sys.platform == "win32": + return self.venv_path / "Scripts" / "python.exe" + return self.venv_path / "bin" / "python" + def _validate_venv(self) -> bool: """Check if existing venv is valid and has correct Lance version.""" if not self.venv_path.exists(): return False - # Determine python path - if sys.platform == "win32": - python_path = self.venv_path / "Scripts" / "python.exe" - else: - python_path = self.venv_path / "bin" / "python" - - if not python_path.exists(): + if not self.python_path.exists(): return False # Check if pylance is installed with correct version try: result = subprocess.run( - [str(python_path), "-m", "pip", "show", "pylance"], + [str(self.python_path), "-m", "pip", "show", "pylance"], capture_output=True, text=True, timeout=5, @@ -87,40 +82,17 @@ def create(self): # Check if persistent venv already exists and is valid if self.persistent and self._validate_venv(): - if DEBUG: - print(f"[TIMING] Reusing existing venv for {self.version}", flush=True) - # Set python path - if sys.platform == "win32": - self.python_path = self.venv_path / "Scripts" / "python.exe" - else: - self.python_path = self.venv_path / "bin" / "python" self._created = True return - start_time = time.time() - if DEBUG: - print(f"[TIMING] Creating venv for {self.version}...", flush=True) - # Create virtual environment - venv_start = time.time() subprocess.run( [sys.executable, "-m", "venv", str(self.venv_path)], check=True, capture_output=True, ) - if DEBUG: - print( - f"[TIMING] venv creation: {time.time() - venv_start:.2f}s", flush=True - ) - - # Determine python path in venv - if sys.platform == "win32": 
- self.python_path = self.venv_path / "Scripts" / "python.exe" - else: - self.python_path = self.venv_path / "bin" / "python" # Install specific pylance version and pytest - install_start = time.time() subprocess.run( [ str(self.python_path), @@ -137,19 +109,8 @@ def create(self): check=True, capture_output=True, ) - if DEBUG: - print( - f"[TIMING] package install: {time.time() - install_start:.2f}s", - flush=True, - ) self._created = True - if DEBUG: - total_time = time.time() - start_time - print( - f"[TIMING] Total venv creation for {self.version}: {total_time:.2f}s", - flush=True, - ) def _ensure_subprocess(self): """Ensure the persistent subprocess is running.""" @@ -157,10 +118,6 @@ def _ensure_subprocess(self): # Subprocess is already running return - if DEBUG: - print(f"[TIMING] Starting subprocess for {self.version}...", flush=True) - start_time = time.time() - # Start persistent subprocess runner_script = Path(__file__).parent / "venv_runner.py" @@ -176,11 +133,6 @@ def _ensure_subprocess(self): stderr=None, # Inherit stderr to see timing messages env=env, ) - if DEBUG: - print( - f"[TIMING] Subprocess started in {time.time() - start_time:.2f}s", - flush=True, - ) def _send_message(self, obj: Any): """Send a length-prefixed pickled message to subprocess.""" @@ -235,37 +187,14 @@ def execute_method(self, obj: Any, method_name: str) -> Any: if not self._created: raise RuntimeError("Virtual environment not created. 
Call create() first.") - start_time = time.time() - if DEBUG: - print(f"[TIMING] Executing {method_name} in {self.version}...", flush=True) - # Ensure subprocess is running - subprocess_start = time.time() self._ensure_subprocess() - if DEBUG and time.time() - subprocess_start > 0.1: - print( - f"[TIMING] subprocess ensure: {time.time() - subprocess_start:.2f}s", - flush=True, - ) - try: # Send request: (obj, method_name) - send_start = time.time() self._send_message((obj, method_name)) - send_time = time.time() - send_start # Receive response - receive_start = time.time() response = self._receive_message() - receive_time = time.time() - receive_start - - if DEBUG: - total_time = time.time() - start_time - print( - f"[TIMING] send: {send_time:.2f}s, receive: {receive_time:.2f}s, " - f"total: {total_time:.2f}s", - flush=True, - ) if response["success"]: return response["result"] From a19e318c158ca679d38bdf09b0ca7afc9dbdd306 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 13 Nov 2025 15:32:52 -0800 Subject: [PATCH 19/19] test more versions automatically --- .../python/tests/compat/compat_decorator.py | 79 ++++++++++--------- .../python/tests/compat/test_file_formats.py | 8 +- .../tests/compat/test_scalar_indices.py | 26 +++--- .../tests/compat/test_vector_indices.py | 28 +------ 4 files changed, 62 insertions(+), 79 deletions(-) diff --git a/python/python/tests/compat/compat_decorator.py b/python/python/tests/compat/compat_decorator.py index 37c2882b60c..2f3c1cee044 100644 --- a/python/python/tests/compat/compat_decorator.py +++ b/python/python/tests/compat/compat_decorator.py @@ -12,34 +12,57 @@ import json import subprocess import sys +import urllib.request from functools import lru_cache +from typing import List import pytest +from packaging.version import Version @lru_cache(maxsize=1) -def last_stable_release(): - """Returns the latest stable version available on PyPI. - - Queries the PyPI JSON API to get the latest stable release of pylance. 
- Results are cached to avoid repeated network calls. - """ +def pylance_stable_versions() -> List[Version]: + """Fetches and returns a sorted list of stable pylance versions from PyPI.""" try: - import urllib.request - with urllib.request.urlopen( "https://pypi.org/pypi/pylance/json", timeout=5 ) as response: data = json.loads(response.read()) - version = data["info"]["version"] - return version + releases = data["releases"].keys() + stable_versions = [ + Version(v) + for v in releases + if not any(c in v for c in ["a", "b", "rc"]) + ] + stable_versions.sort() + return stable_versions except Exception as e: - # If we can't fetch, return None which will be filtered out print( - f"Warning: Could not fetch latest stable release from PyPI: {e}", + f"Warning: Could not fetch pylance versions from PyPI: {e}", file=sys.stderr, ) - return None + return [] + + +def recent_major_versions(n: int) -> List[str]: + """Returns the n most recent major versions of pylance as strings.""" + stable_versions = pylance_stable_versions() + major_versions = [] + seen_majors = set() + + def key(v: Version): + # On 0.x versions, we bumped minor version for breaking changes. 
+ if v.major == 0: + return (0, v.minor) + return v.major + + for v in reversed(stable_versions): + if key(v) not in seen_majors: + seen_majors.add(key(v)) + major_versions.append(str(v)) + if len(major_versions) >= n: + break + return major_versions @lru_cache(maxsize=1) @@ -98,9 +121,10 @@ def last_beta_release(): return None -# Fetch versions (cached) -LAST_STABLE_RELEASE = last_stable_release() +VERSIONS = recent_major_versions(3) LAST_BETA_RELEASE = last_beta_release() +if LAST_BETA_RELEASE is not None: + VERSIONS.append(LAST_BETA_RELEASE) class UpgradeDowngradeTest: @@ -122,15 +146,7 @@ def check_write(self): pass -# Default versions to test, filtering out any that couldn't be fetched -VERSIONS = [ - v - for v in ["0.16.0", "0.30.0", "0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE] - if v is not None -] - - -def compat_test(versions=None): +def compat_test(min_version: str = "0.16.0"): """Decorator to generate upgrade/downgrade compatibility tests. This decorator transforms a test class into two parameterized pytest test functions: @@ -173,19 +189,8 @@ def check_write(self): # Write data pass """ - if versions is None: - versions = VERSIONS - - # Filter out None values (in case some versions couldn't be fetched) - versions = [v for v in versions if v is not None] - - # Skip if no valid versions - if not versions: - - def decorator(cls): - return cls - - return decorator + version = set([min_version, *VERSIONS]) + versions = [v for v in version if Version(v) >= Version(min_version)] def decorator(cls): # Extract existing parametrize marks from the class diff --git a/python/python/tests/compat/test_file_formats.py b/python/python/tests/compat/test_file_formats.py index 8cef20c15ea..f65c8611ff6 100644 --- a/python/python/tests/compat/test_file_formats.py +++ b/python/python/tests/compat/test_file_formats.py @@ -15,8 +15,6 @@ from lance.file import LanceFileReader, LanceFileWriter from .compat_decorator import ( - LAST_BETA_RELEASE, - LAST_STABLE_RELEASE, 
UpgradeDowngradeTest, compat_test, ) @@ -25,7 +23,7 @@ # We start testing against the first release where 2.1 was stable. Before that # the format was unstable so the readers will panic. -@compat_test(versions=["0.38.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +@compat_test(min_version="0.38.0") class BasicTypes2_1(UpgradeDowngradeTest): """Test file format 2.1 compatibility with basic data types.""" @@ -50,7 +48,7 @@ def check_write(self): writer.write_batch(build_basic_types()) -@compat_test() +@compat_test(min_version="0.16.0") @pytest.mark.parametrize( "data_factory,name", [ @@ -92,7 +90,7 @@ def check_write(self): writer.write_batch(batch) -@compat_test() +@compat_test(min_version="0.16.0") class BasicTypesLegacy(UpgradeDowngradeTest): """Test legacy data storage version 0.1 compatibility.""" diff --git a/python/python/tests/compat/test_scalar_indices.py b/python/python/tests/compat/test_scalar_indices.py index 5c5291594e3..5d42a837bdc 100644 --- a/python/python/tests/compat/test_scalar_indices.py +++ b/python/python/tests/compat/test_scalar_indices.py @@ -16,16 +16,17 @@ import pyarrow as pa from .compat_decorator import ( - LAST_BETA_RELEASE, - LAST_STABLE_RELEASE, UpgradeDowngradeTest, compat_test, ) -@compat_test(versions=["0.30.0", "0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +@compat_test(min_version="0.30.0") class BTreeIndex(UpgradeDowngradeTest): - """Test BTREE scalar index compatibility (introduced in 0.20.0).""" + """Test BTREE scalar index compatibility (introduced in 0.20.0). + + Started fully working in 0.30.0 with various fixes. 
+ """ def __init__(self, path: Path): self.path = path @@ -71,11 +72,12 @@ def check_write(self): assert table.num_rows >= 1 -@compat_test( - versions=["0.22.0", "0.30.0", "0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE] -) +@compat_test(min_version="0.22.0") class BitmapLabelListIndex(UpgradeDowngradeTest): - """Test BITMAP and LABEL_LIST scalar index compatibility (introduced in 0.20.0).""" + """Test BITMAP and LABEL_LIST scalar index compatibility (introduced in 0.20.0). + + Started fully working in 0.22.0 with fixes to LABEL_LIST index. + """ def __init__(self, path: Path): self.path = path @@ -123,7 +125,7 @@ def check_write(self): ds.optimize.compact_files() -@compat_test(versions=["0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +@compat_test(min_version="0.36.0") class NgramIndex(UpgradeDowngradeTest): """Test NGRAM index compatibility (introduced in 0.36.0).""" @@ -167,7 +169,7 @@ def check_write(self): ds.optimize.compact_files() -@compat_test(versions=["0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +@compat_test(min_version="0.36.0") class ZonemapBloomfilterIndex(UpgradeDowngradeTest): """Test ZONEMAP and BLOOMFILTER index compatibility (introduced in 0.36.0).""" @@ -217,7 +219,7 @@ def check_write(self): ds.optimize.compact_files() -@compat_test(versions=["0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +@compat_test(min_version="0.36.0") class JsonIndex(UpgradeDowngradeTest): """Test JSON index compatibility (introduced in 0.36.0).""" @@ -268,7 +270,7 @@ def check_write(self): ds.optimize.compact_files() -@compat_test(versions=["0.36.0", LAST_STABLE_RELEASE, LAST_BETA_RELEASE]) +@compat_test(min_version="0.36.0") class FtsIndex(UpgradeDowngradeTest): """Test FTS (full-text search) index compatibility (introduced in 0.36.0).""" diff --git a/python/python/tests/compat/test_vector_indices.py b/python/python/tests/compat/test_vector_indices.py index ab476bf44fa..b58ded4f5ff 100644 --- a/python/python/tests/compat/test_vector_indices.py +++ 
b/python/python/tests/compat/test_vector_indices.py @@ -16,22 +16,12 @@ import pyarrow.compute as pc from .compat_decorator import ( - LAST_BETA_RELEASE, - LAST_STABLE_RELEASE, UpgradeDowngradeTest, compat_test, ) -@compat_test( - versions=[ - "0.29.1.beta2", - "0.30.0", - "0.36.0", - LAST_STABLE_RELEASE, - LAST_BETA_RELEASE, - ] -) +@compat_test(min_version="0.29.1.beta2") class PqVectorIndex(UpgradeDowngradeTest): """Test PQ (Product Quantization) vector index compatibility.""" @@ -92,13 +82,7 @@ def check_write(self): ds.optimize.compact_files() -@compat_test( - versions=[ - "0.39.0", - LAST_STABLE_RELEASE, - LAST_BETA_RELEASE, - ] -) +@compat_test(min_version="0.39.0") class HnswPqVectorIndex(UpgradeDowngradeTest): """Test IVF_HNSW_PQ vector index compatibility. @@ -163,13 +147,7 @@ def check_write(self): ds.optimize.compact_files() -@compat_test( - versions=[ - "0.39.0", - LAST_STABLE_RELEASE, - LAST_BETA_RELEASE, - ] -) +@compat_test(min_version="0.39.0") class HnswSqVectorIndex(UpgradeDowngradeTest): """Test IVF_HNSW_SQ vector index compatibility.