-
Notifications
You must be signed in to change notification settings - Fork 638
test: add regression test for ivf/pq search #5476
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
westonpace
merged 1 commit into
lance-format:main
from
westonpace:test/ivf-pq-regression
Jan 5, 2026
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
250 changes: 250 additions & 0 deletions
250
python/python/ci_benchmarks/benchmarks/test_ivf_pq_search.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,250 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
|
||
| """Benchmarks for IVF_PQ vector search performance.""" | ||
|
|
||
| import math | ||
| import tempfile | ||
| from pathlib import Path | ||
|
|
||
| import lance | ||
| import numpy as np | ||
| import pyarrow as pa | ||
| import pytest | ||
| from ci_benchmarks.utils import wipe_os_cache | ||
| from lance.tracing import trace_to_chrome | ||
|
|
||
# NOTE(review): this runs at import time, so tracing is presumably active for
# every benchmark collected from this module and the trace lands at a fixed
# path under /tmp -- confirm that overhead and location are intended.
trace_to_chrome(file="/tmp/trace.json")


# Test parameters
# Every *_LABELS list is passed to pytest as the ``ids`` of the value list
# directly above it, so each pair must stay the same length and order.
DATASET_SIZES = [100_000, 1_000_000]
DATASET_SIZE_LABELS = ["100K", "1M"]
VECTOR_DIM = 1024

# Number of partitions to search (nprobes)
NPROBES = [10, 50]
NPROBES_LABELS = ["10probes", "50probes"]

# Refine factor for vector search
# ``None`` means the ``refine_factor`` key is left out of the search
# parameters altogether.
REFINE_FACTORS = [None, 1]
REFINE_FACTOR_LABELS = ["no_refine", "refine_1x"]

# Number of results to return (k)
K_VALUES = [10, 100]
K_LABELS = ["k10", "k100"]


# Global cache for datasets, keyed by (num_rows, dim); values are dataset URIs
_DATASET_CACHE = {}
|
|
||
|
|
||
def _generate_vector_dataset(num_rows: int, dim: int = 1024):
    """Generate a random vector dataset for IVF_PQ search benchmarks.

    Rows are produced in batches of at most 10,000 so the full dataset never
    has to be materialised at once. When ``num_rows`` is not a multiple of
    the batch size the final batch is simply shorter, so exactly ``num_rows``
    rows are produced (previously the trailing partial batch was dropped and
    fewer than 10,000 rows produced nothing at all).

    Args:
        num_rows: Number of vectors to generate
        dim: Dimensionality of vectors (default: 1024)

    Yields:
        PyArrow RecordBatch with a ``vector`` column (fixed-size list of
        float32, ``dim`` wide) and an ``id`` column (int64, consecutive
        starting at 0)
    """
    batch_size = 10_000

    for start in range(0, num_rows, batch_size):
        # Clamp the last batch so a trailing partial batch is not lost.
        stop = min(start + batch_size, num_rows)

        # Generate random vectors with 32-bit floats
        vectors = np.random.randn(stop - start, dim).astype(np.float32)

        # Convert to PyArrow fixed_size_list
        vector_array = pa.FixedSizeListArray.from_arrays(
            pa.array(vectors.flatten(), type=pa.float32()), list_size=dim
        )

        # Add an ID column for reference
        ids = pa.array(range(start, stop), type=pa.int64())

        yield pa.record_batch([vector_array, ids], names=["vector", "id"])
|
|
||
|
|
||
def _get_or_create_dataset(num_rows: int, dim: int = 1024) -> str:
    """Get or create an IVF_PQ-indexed dataset with the specified parameters.

    Datasets are cached globally per process to avoid expensive recreation.
    Each dataset lives in its own temporary directory. That directory is
    removed when the interpreter exits, or immediately if building the
    dataset or its index fails, so repeated benchmark runs do not leave
    large directories behind.

    Args:
        num_rows: Number of vectors in the dataset
        dim: Dimensionality of vectors (default: 1024)

    Returns:
        The URI of the dataset.
    """
    # Imported here because only this function needs them.
    import atexit
    import shutil

    cache_key = (num_rows, dim)
    cached_uri = _DATASET_CACHE.get(cache_key)
    if cached_uri is not None:
        return cached_uri

    # Directory that outlives this call so later tests can reuse the dataset
    tmpdir = tempfile.mkdtemp(prefix=f"lance_bench_{num_rows}_{dim}_")
    dataset_uri = str(Path(tmpdir) / "vector_dataset.lance")

    try:
        # Create schema
        schema = pa.schema(
            [
                pa.field("vector", pa.list_(pa.float32(), dim)),
                pa.field("id", pa.int64()),
            ]
        )

        # Generate and write dataset
        data = _generate_vector_dataset(num_rows, dim)
        ds = lance.write_dataset(
            data,
            dataset_uri,
            schema=schema,
            mode="create",
        )

        # Never request zero partitions / sub-vectors for small inputs; the
        # values are unchanged for the sizes used by the benchmarks.
        num_partitions = max(1, min(num_rows // 4000, int(math.sqrt(num_rows))))
        num_sub_vectors = max(1, dim // 16)

        # Create IVF_PQ index
        ds.create_index(
            "vector",
            index_type="IVF_PQ",
            num_partitions=num_partitions,
            num_sub_vectors=num_sub_vectors,
        )
    except BaseException:
        # Do not leave a half-written dataset on disk.
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise

    # Only register cleanup once the dataset is usable and about to be cached.
    atexit.register(shutil.rmtree, tmpdir, ignore_errors=True)
    _DATASET_CACHE[cache_key] = dataset_uri
    return dataset_uri
|
|
||
|
|
||
def _run_search_benchmark(
    benchmark,
    columns: list[str],
    num_rows: int,
    nprobes: int,
    refine_factor: int | None,
    k: int,
    use_cache: bool,
):
    """Run one IVF_PQ search benchmark configuration.

    Shared implementation for the search benchmarks in this module; the
    callers differ only in which columns they project.

    Args:
        benchmark: The pytest-benchmark fixture
        columns: Columns to return from the search
        num_rows: Size of the dataset to search
        nprobes: Number of partitions to search
        refine_factor: Refine factor, or None to omit it from the query
        k: Number of results to return
        use_cache: When True, reuse one opened dataset across rounds and run
            a single warmup round. When False, call ``wipe_os_cache`` on the
            dataset before every round and reopen the dataset inside the
            timed function, with no warmup.
    """
    # Get or create the dataset (cached globally per process)
    dataset_uri = _get_or_create_dataset(num_rows, dim=VECTOR_DIM)
    ds = lance.dataset(dataset_uri)

    # Generate query vector
    query_vector = np.random.randn(VECTOR_DIM).astype(np.float32)

    def bench():
        # Reload dataset if not using cache
        search_ds = ds if use_cache else lance.dataset(dataset_uri)

        # Build search parameters
        search_params = {
            "column": "vector",
            "q": query_vector,
            "k": k,
            "nprobes": nprobes,
        }
        if refine_factor is not None:
            search_params["refine_factor"] = refine_factor

        # Perform vector search
        search_ds.to_table(
            nearest=search_params,
            columns=columns,
        )

    if use_cache:
        setup = None
        warmup_rounds = 1
    else:

        def setup():
            # Must return None: pytest-benchmark treats a setup return value
            # as arguments for the benchmarked function.
            wipe_os_cache(dataset_uri)

        warmup_rounds = 0

    benchmark.pedantic(
        bench,
        warmup_rounds=warmup_rounds,
        rounds=100,
        # Explicit because pedantic mode requires one iteration per round
        # whenever a setup function is supplied.
        iterations=1,
        setup=setup,
    )


@pytest.mark.parametrize("num_rows", DATASET_SIZES, ids=DATASET_SIZE_LABELS)
@pytest.mark.parametrize("nprobes", NPROBES, ids=NPROBES_LABELS)
@pytest.mark.parametrize("refine_factor", REFINE_FACTORS, ids=REFINE_FACTOR_LABELS)
@pytest.mark.parametrize("k", K_VALUES, ids=K_LABELS)
@pytest.mark.parametrize("use_cache", [True, False], ids=["cache", "no_cache"])
def test_ivf_pq_search(
    benchmark,
    num_rows: int,
    nprobes: int,
    refine_factor: int | None,
    k: int,
    use_cache: bool,
):
    """Benchmark IVF_PQ vector search with different configurations.

    Tests vector search performance with:
    - Different dataset sizes (100K, 1M vectors)
    - Different numbers of partitions searched (10, 50 nprobes)
    - Different refine factors (None, 1x)
    - Different result counts (k=10, k=100)
    - Cached vs uncached index performance

    Uses 1024-dimensional float32 vectors with IVF_PQ index.
    """
    _run_search_benchmark(
        benchmark,
        ["id"],
        num_rows=num_rows,
        nprobes=nprobes,
        refine_factor=refine_factor,
        k=k,
        use_cache=use_cache,
    )


@pytest.mark.parametrize("num_rows", DATASET_SIZES, ids=DATASET_SIZE_LABELS)
@pytest.mark.parametrize("nprobes", NPROBES, ids=NPROBES_LABELS)
@pytest.mark.parametrize("refine_factor", REFINE_FACTORS, ids=REFINE_FACTOR_LABELS)
@pytest.mark.parametrize("k", K_VALUES, ids=K_LABELS)
@pytest.mark.parametrize("use_cache", [True, False], ids=["cache", "no_cache"])
def test_ivf_pq_search_with_payload(
    benchmark,
    num_rows: int,
    nprobes: int,
    refine_factor: int | None,
    k: int,
    use_cache: bool,
):
    """Benchmark IVF_PQ vector search with payload columns.

    Similar to test_ivf_pq_search but includes retrieving vector data
    along with results, which tests data loading performance.
    """
    # Search and retrieve both vector and id columns
    _run_search_benchmark(
        benchmark,
        ["vector", "id"],
        num_rows=num_rows,
        nprobes=nprobes,
        refine_factor=refine_factor,
        k=k,
        use_cache=use_cache,
    )
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Two dimensions not tested here that seem worthwhile are:
- `num_bits` in PQ (4-bit vs 8-bit)
- `data_type` in the vector (f16 vs f32)

These settings will trigger different code paths, which I think are important to cover in regression testing.
There are some settings here, like `k`, which I don't think really hit different code paths so much as they just change a variable in the same code path.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think Rust-level benchmarks would sufficiently (and probably with more fidelity) capture those parameters, since they primarily affect the compute-driven PQ search time. These parameters would also require training completely new indexes, which is what dominates the benchmark time.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We have `pq_dist_table` and `4bit_pq_dist_table`, which cover `num_bits` and `distance_type`. However, these only look at f32. I'll add an issue to add `data_type` to these benchmarks (I want to review them anyway, since the lance-index benches don't pass reliably or quickly enough for me).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#5478