From 4116d5e8a74e303bd7ed9e7e9a461f7c4edb2bb5 Mon Sep 17 00:00:00 2001
From: Weston Pace <weston.pace@gmail.com>
Date: Tue, 6 Jan 2026 06:27:22 -0800
Subject: [PATCH 1/4] Add vector throughput benchmark

---
 .../benchmarks/test_ivf_pq_search.py          | 142 +++++--
 python/python/ci_benchmarks/datagen/basic.py  |   2 -
 rust/lance/Cargo.toml                         |   5 +
 rust/lance/benches/vector_throughput.rs       | 352 ++++++++++++++++++
 4 files changed, 461 insertions(+), 40 deletions(-)
 create mode 100644 rust/lance/benches/vector_throughput.rs

diff --git a/python/python/ci_benchmarks/benchmarks/test_ivf_pq_search.py b/python/python/ci_benchmarks/benchmarks/test_ivf_pq_search.py
index 6b81d7e9887..c271c574679 100644
--- a/python/python/ci_benchmarks/benchmarks/test_ivf_pq_search.py
+++ b/python/python/ci_benchmarks/benchmarks/test_ivf_pq_search.py
@@ -4,7 +4,9 @@
 """Benchmarks for IVF_PQ vector search performance."""
 
 import math
+import multiprocessing as mp
 import tempfile
+from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 
 import lance
@@ -35,8 +37,8 @@
 K_LABELS = ["k10", "k100"]
 
 
-# Global cache for datasets, keyed by (num_rows, dim)
-_DATASET_CACHE = {}
+# Datasets are stored in fixed temporary directories and reused between runs
+# to avoid retraining indexes
 
 
 def _generate_vector_dataset(num_rows: int, dim: int = 1024):
@@ -73,46 +75,56 @@ def _generate_vector_dataset(num_rows: int, dim: int = 1024):
 def _get_or_create_dataset(num_rows: int, dim: int = 1024) -> str:
     """Get or create a dataset with the specified parameters.
 
-    Datasets are cached globally per process to avoid expensive recreation.
+    Uses a fixed temporary directory so datasets persist between benchmark runs.
+    If the dataset exists and has the correct number of rows, it will be reused.
     Returns the URI to the dataset.
     """
-    cache_key = (num_rows, dim)
-
-    if cache_key not in _DATASET_CACHE:
-        # Create a persistent temporary directory for this dataset
-        tmpdir = tempfile.mkdtemp(prefix=f"lance_bench_{num_rows}_{dim}_")
-        dataset_uri = str(Path(tmpdir) / "vector_dataset.lance")
-
-        # Create schema
-        schema = pa.schema(
-            [
-                pa.field("vector", pa.list_(pa.float32(), dim)),
-                pa.field("id", pa.int64()),
-            ]
-        )
-
-        # Generate and write dataset
-        data = _generate_vector_dataset(num_rows, dim)
-        ds = lance.write_dataset(
-            data,
-            dataset_uri,
-            schema=schema,
-            mode="create",
-        )
+    # Use a fixed directory path based on parameters
+    tmpdir = Path(tempfile.gettempdir()) / f"lance_bench_{num_rows}_{dim}"
+    tmpdir.mkdir(exist_ok=True)
+    dataset_uri = "file+uring://" + str(tmpdir / "vector_dataset.lance")
+
+    # Check if dataset already exists and has correct row count
+    try:
+        ds = lance.dataset(dataset_uri)
+        if ds.count_rows() == num_rows:
+            print(f"Reusing existing dataset at {dataset_uri}")
+            return dataset_uri
+        else:
+            print(
+                f"Dataset exists but has wrong row count ({ds.count_rows()} vs {num_rows}), recreating..."
+            )
+    except Exception:
+        print(f"Creating new dataset at {dataset_uri}")
+
+    # Create schema
+    schema = pa.schema(
+        [
+            pa.field("vector", pa.list_(pa.float32(), dim)),
+            pa.field("id", pa.int64()),
+        ]
+    )
 
-        num_partitions = min(num_rows // 4000, int(math.sqrt(num_rows)))
+    # Generate and write dataset
+    data = _generate_vector_dataset(num_rows, dim)
+    ds = lance.write_dataset(
+        data,
+        dataset_uri,
+        schema=schema,
+        mode="overwrite",  # Use overwrite to handle recreation
+    )
 
-        # Create IVF_PQ index
-        ds.create_index(
-            "vector",
-            index_type="IVF_PQ",
-            num_partitions=num_partitions,
-            num_sub_vectors=dim // 16,
-        )
+    num_partitions = min(num_rows // 4000, int(math.sqrt(num_rows)))
 
-        _DATASET_CACHE[cache_key] = dataset_uri
+    # Create IVF_PQ index
+    ds.create_index(
+        "vector",
+        index_type="IVF_PQ",
+        num_partitions=num_partitions,
+        num_sub_vectors=dim // 16,
+    )
 
-    return _DATASET_CACHE[cache_key]
+    return dataset_uri
 
 
 @pytest.mark.parametrize("num_rows", DATASET_SIZES, ids=DATASET_SIZE_LABELS)
@@ -139,7 +151,7 @@ def test_ivf_pq_search(
 
     Uses 1024-dimensional float32 vectors with IVF_PQ index.
     """
-    # Get or create the dataset (cached globally per process)
+    # Get or create the dataset (reused from fixed temp directory between runs)
     dataset_uri = _get_or_create_dataset(num_rows, dim=VECTOR_DIM)
     ds = lance.dataset(dataset_uri)
 
@@ -204,7 +216,7 @@ def test_ivf_pq_search_with_payload(
     Similar to test_ivf_pq_search but includes retrieving vector data
     along with results, which tests data loading performance.
     """
-    # Get or create the dataset (cached globally per process)
+    # Get or create the dataset (reused from fixed temp directory between runs)
     dataset_uri = _get_or_create_dataset(num_rows, dim=VECTOR_DIM)
     ds = lance.dataset(dataset_uri)
 
@@ -248,3 +260,57 @@ def bench():
         iterations=1,
         setup=setup,
     )
+
+
+@pytest.mark.parametrize("use_cache", [True, False], ids=["cache", "no_cache"])
+def test_ivf_pq_throughput(
+    benchmark,
+    use_cache: bool,
+):
+    """Benchmark concurrent IVF_PQ search throughput (1000 queries, with payload)."""
+    # Get or create the dataset (reused from fixed temp directory between runs)
+    dataset_uri = _get_or_create_dataset(1_000_000, dim=768)
+    ds = lance.dataset(dataset_uri)
+
+    NUM_QUERIES = 1000
+    # 2 * (cpu_count - 2) is <= 0 on hosts with at most 2 CPUs, which makes
+    # ThreadPoolExecutor raise ValueError; clamp to at least one worker.
+    num_workers = max(1, 2 * (mp.cpu_count() - 2))
+
+    # Generate query vectors
+    query_vectors = [
+        np.random.randn(768).astype(np.float32) for _ in range(NUM_QUERIES)
+    ]
+
+    def clear_cache():
+        if not use_cache:
+            wipe_os_cache(dataset_uri)
+
+    def bench():
+        with ThreadPoolExecutor(max_workers=num_workers) as executor:
+            futures = [
+                executor.submit(
+                    ds.to_table,
+                    nearest={
+                        "column": "vector",
+                        "q": query_vector,
+                        "k": 50,
+                        "nprobes": 20,
+                        "refine_factor": 10,
+                    },
+                    columns=["vector", "_distance"],
+                )
+                for query_vector in query_vectors
+            ]
+            for future in futures:
+                future.result()
+
+    setup = None if use_cache else clear_cache
+
+    benchmark.pedantic(
+        bench,
+        warmup_rounds=1,
+        rounds=1,
+        iterations=1,
+        setup=setup,
+    )
diff --git a/python/python/ci_benchmarks/datagen/basic.py b/python/python/ci_benchmarks/datagen/basic.py
index c14d7dcb47a..9629ac09509 100644
--- a/python/python/ci_benchmarks/datagen/basic.py
+++ b/python/python/ci_benchmarks/datagen/basic.py
@@ -58,7 +58,6 @@ def _create(dataset_uri: str):
                     dataset_uri,
                     schema=SCHEMA,
                     mode="append",
-                    use_legacy_format=False,
                 )
             else:
                 raise Exception(
@@ -72,7 +71,6 @@ def _create(dataset_uri: str):
             dataset_uri,
             schema=SCHEMA,
             mode="create",
-            use_legacy_format=False,
         )
     if ds.list_indices() == []:
         ds.create_scalar_index("row_number", "BTREE")
diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml
index 25c30230b35..a36cb6c25e0 100644
--- a/rust/lance/Cargo.toml
+++ b/rust/lance/Cargo.toml
@@ -91,6 +91,7 @@ lzma-sys = { version = "0.1" }
 lance-test-macros = { workspace = true }
 lance-datagen = { workspace = true }
 pretty_assertions = { workspace = true }
+libc = { workspace = true }
 clap = { workspace = true, features = ["derive"] }
 criterion = { workspace = true }
 approx.workspace = true
@@ -165,5 +166,9 @@ harness = false
 name = "random_access"
 harness = false
 
+[[bench]]
+name = "vector_throughput"
+harness = false
+
 [lints]
 workspace = true
diff --git a/rust/lance/benches/vector_throughput.rs b/rust/lance/benches/vector_throughput.rs
new file mode 100644
index 00000000000..89f82372988
--- /dev/null
+++ b/rust/lance/benches/vector_throughput.rs
@@ -0,0 +1,352 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Benchmark for IVF_PQ vector search throughput
+//!
+//! This benchmark measures concurrent vector search performance with IVF_PQ indexes,
+//! similar to the Python test_ivf_pq_throughput benchmark.
+
+#![allow(clippy::print_stdout)]
+
+use std::sync::Arc;
+
+use arrow_array::{FixedSizeListArray, Float32Array, RecordBatch, RecordBatchIterator};
+use arrow_schema::{DataType, Field, FieldRef, Schema as ArrowSchema};
+use criterion::{criterion_group, criterion_main, BatchSize, Criterion, Throughput};
+use futures::{StreamExt, TryStreamExt};
+use lance_file::version::LanceFileVersion;
+#[cfg(target_os = "linux")]
+use pprof::criterion::{Output, PProfProfiler};
+use rand::Rng;
+
+use lance::dataset::{Dataset, WriteMode, WriteParams};
+use lance::index::vector::VectorIndexParams;
+use lance_arrow::FixedSizeListArrayExt;
+use lance_index::{
+    vector::{ivf::IvfBuildParams, pq::PQBuildParams},
+    DatasetIndexExt, IndexType,
+};
+use lance_linalg::distance::MetricType;
+use lance_testing::datagen::generate_random_array;
+use tokio::runtime::Runtime;
+
+// Benchmark parameters mirror Python test_ivf_pq_throughput, except NUM_QUERIES (1000 there)
+const NUM_ROWS: usize = 1_000_000;
+const DIM: usize = 768;
+const NUM_QUERIES: usize = 100;
+const K: usize = 50;
+const NPROBES: usize = 20;
+const REFINE_FACTOR: u32 = 10;
+
+// IVF_PQ index parameters
+const IVF_PARTITIONS: usize = 256;
+const PQ_BITS: usize = 8;
+const PQ_SUB_VECTORS: usize = DIM / 16;
+const MAX_ITERATIONS: usize = 50;
+
+/// Cached dataset with pre-generated query vectors
+struct CachedDataset {
+    uri: String,
+    dataset: Arc<Dataset>,
+    query_vectors: Vec<Arc<Float32Array>>,
+}
+
+/// Get or create a cached dataset with IVF_PQ index and query vectors
+fn get_or_create_dataset(rt: &Runtime, version: LanceFileVersion) -> Arc<CachedDataset> {
+    // Create dataset in fixed temp directory
+    let uri = format!(
+        "file+uring:///tmp/lance_bench_throughput_{}_{}_{}",
+        NUM_ROWS, DIM, version
+    );
+
+    rt.block_on(async {
+        // Check if dataset exists on disk with correct row count
+        let mut needs_creation = true;
+        let mut needs_indexing = true;
+
+        if let Ok(dataset) = Dataset::open(&uri).await {
+            let row_count = dataset.count_rows(None).await.unwrap();
+            if row_count == NUM_ROWS {
+                println!("Reusing existing dataset at {} ({} rows)", uri, row_count);
+                needs_creation = false;
+
+                // Check if index exists
+                let indices = dataset.load_indices().await.unwrap();
+                if !indices.is_empty() {
+                    println!(
+                        "Dataset already has {} index(es), skipping index creation",
+                        indices.len()
+                    );
+                    needs_indexing = false;
+                } else {
+                    println!("Dataset exists but has no index, will create index");
+                }
+            } else {
+                println!(
+                    "Dataset exists but has wrong row count ({} vs {}), recreating",
+                    row_count, NUM_ROWS
+                );
+                std::fs::remove_dir_all(&uri).ok();
+            }
+        } else {
+            println!(
+                "Creating new dataset with {} rows, {} dimensions",
+                NUM_ROWS, DIM
+            );
+        }
+
+        // Create dataset if needed
+        if needs_creation {
+            create_dataset(&uri).await;
+        }
+
+        // Open dataset
+        let mut dataset = Dataset::open(&uri).await.unwrap();
+
+        // Create index if needed
+        if needs_indexing {
+            create_ivf_pq_index(&mut dataset).await;
+        }
+
+        // Generate query vectors
+        let query_vectors = generate_query_vectors();
+
+        let cached = Arc::new(CachedDataset {
+            uri: uri.clone(),
+            dataset: Arc::new(dataset),
+            query_vectors,
+        });
+
+        cached
+    })
+}
+
+/// Create a dataset with random vectors
+async fn create_dataset(uri: &str) {
+    let schema = Arc::new(ArrowSchema::new(vec![Field::new(
+        "vector",
+        DataType::FixedSizeList(
+            FieldRef::new(Field::new("item", DataType::Float32, true)),
+            DIM as i32,
+        ),
+        false,
+    )]));
+
+    let batch_size = 10_000;
+    let batches: Vec<RecordBatch> = (0..(NUM_ROWS / batch_size))
+        .map(|_| {
+            RecordBatch::try_new(
+                schema.clone(),
+                vec![Arc::new(
+                    FixedSizeListArray::try_new_from_values(
+                        generate_random_array(batch_size * DIM),
+                        DIM as i32,
+                    )
+                    .unwrap(),
+                )],
+            )
+            .unwrap()
+        })
+        .collect();
+
+    let write_params = WriteParams {
+        max_rows_per_file: NUM_ROWS,
+        max_rows_per_group: batch_size,
+        mode: WriteMode::Create,
+        ..Default::default()
+    };
+
+    let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
+    Dataset::write(reader, uri, Some(write_params))
+        .await
+        .unwrap();
+
+    println!("Dataset created at {}", uri);
+}
+
+/// Create IVF_PQ index on the dataset
+async fn create_ivf_pq_index(dataset: &mut Dataset) {
+    println!("Creating IVF_PQ index...");
+
+    let ivf_params = IvfBuildParams {
+        num_partitions: Some(IVF_PARTITIONS),
+        max_iters: MAX_ITERATIONS,
+        ..Default::default()
+    };
+    let pq_params = PQBuildParams {
+        num_bits: PQ_BITS,
+        num_sub_vectors: PQ_SUB_VECTORS,
+        ..Default::default()
+    };
+    let params = VectorIndexParams::with_ivf_pq_params(MetricType::L2, ivf_params, pq_params);
+
+    dataset
+        .create_index(
+            vec!["vector"].as_slice(),
+            IndexType::Vector,
+            Some("ivf_pq_index".to_string()),
+            &params,
+            true,
+        )
+        .await
+        .unwrap();
+
+    println!("IVF_PQ index created");
+}
+
+/// Generate random query vectors
+fn generate_query_vectors() -> Vec<Arc<Float32Array>> {
+    let mut rng = rand::rng();
+    (0..NUM_QUERIES)
+        .map(|_| {
+            let values: Vec<f32> = (0..DIM).map(|_| rng.random_range(0.0..1.0)).collect();
+            Arc::new(Float32Array::from(values))
+        })
+        .collect()
+}
+
+/// Drop dataset files from OS page cache (Linux only)
+#[cfg(target_os = "linux")]
+fn drop_dataset_from_cache(uri: &str) -> std::io::Result<()> {
+    use std::fs;
+    use std::os::unix::io::AsRawFd;
+
+    // Walk the dataset directory and drop each file from cache
+    if let Ok(entries) = fs::read_dir(uri) {
+        for entry in entries.flatten() {
+            let path = entry.path();
+            if path.is_file() {
+                if let Ok(file) = fs::File::open(&path) {
+                    let fd = file.as_raw_fd();
+                    // POSIX_FADV_DONTNEED = 4
+                    let result =
+                        unsafe { libc::posix_fadvise(fd, 0, 0, libc::POSIX_FADV_DONTNEED) };
+                    if result != 0 {
+                        eprintln!(
+                            "Warning: Failed to drop {:?} from cache: {}",
+                            path,
+                            std::io::Error::from_raw_os_error(result)
+                        );
+                    }
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+#[cfg(not(target_os = "linux"))]
+fn drop_dataset_from_cache(_uri: &str) -> std::io::Result<()> {
+    Ok(())
+}
+
+/// Run vector search queries
+async fn run_queries(
+    dataset: Arc<Dataset>,
+    query_vectors: &[Arc<Float32Array>],
+    concurrent_queries: usize,
+) {
+    // Run queries concurrently using tokio tasks
+    futures::stream::iter(query_vectors)
+        .map(|q| {
+            let dataset = dataset.clone();
+            let q = q.clone();
+            tokio::spawn(async move {
+                dataset
+                    .scan()
+                    .nearest("vector", q.as_ref(), K)
+                    .unwrap()
+                    .minimum_nprobes(NPROBES)
+                    .maximum_nprobes(NPROBES)
+                    .refine(REFINE_FACTOR)
+                    .project(&["vector", "_distance"])
+                    .unwrap()
+                    .try_into_stream()
+                    .await
+                    .unwrap()
+                    .try_collect::<Vec<_>>()
+                    .await
+                    .unwrap()
+            })
+        })
+        .buffered(concurrent_queries)
+        .try_collect::<Vec<_>>()
+        .await
+        .unwrap();
+}
+
+fn bench_ivf_pq_throughput(c: &mut Criterion) {
+    env_logger::init();
+
+    let rt = tokio::runtime::Builder::new_multi_thread().build().unwrap();
+
+    let mut group = c.benchmark_group("ivf_pq_throughput");
+    group.throughput(Throughput::Elements(NUM_QUERIES as u64));
+
+    for &version in &[LanceFileVersion::V2_0, LanceFileVersion::V2_1] {
+        // Get or create cached dataset
+        let cached_dataset = get_or_create_dataset(&rt, version);
+
+        for &concurrent_queries in &[1, 16] {
+            for &cached in &[true, false] {
+                // Skip uncached tests on non-Linux platforms
+                #[cfg(not(target_os = "linux"))]
+                if !cached {
+                    continue;
+                }
+
+                let cache_label = if cached { "cached" } else { "nocache" };
+
+                // One pass to warm up the index cache
+                rt.block_on(run_queries(
+                    cached_dataset.dataset.clone(),
+                    &cached_dataset.query_vectors,
+                    concurrent_queries,
+                ));
+
+                group.bench_function(
+                    format!("{}_{}threads_{}", version, concurrent_queries, cache_label),
+                    |b| {
+                        b.iter_batched(
+                            || {
+                                // Setup: drop cache if uncached
+                                if !cached {
+                                    drop_dataset_from_cache(&cached_dataset.uri).ok();
+                                }
+                            },
+                            |_| {
+                                // Run the queries
+                                rt.block_on(run_queries(
+                                    cached_dataset.dataset.clone(),
+                                    &cached_dataset.query_vectors,
+                                    concurrent_queries,
+                                ));
+                            },
+                            BatchSize::PerIteration,
+                        );
+                    },
+                );
+            }
+        }
+    }
+    group.finish();
+}
+
+#[cfg(target_os = "linux")]
+criterion_group!(
+    name=benches;
+    config = Criterion::default().significance_level(0.1).sample_size(10)
+        .with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets = bench_ivf_pq_throughput
+);
+
+// Non-linux version does not support pprof.
+#[cfg(not(target_os = "linux"))]
+criterion_group!(
+    name=benches;
+    config = Criterion::default().significance_level(0.1).sample_size(10);
+    targets = bench_ivf_pq_throughput
+);
+
+criterion_main!(benches);

From 1a981457952002ba45060971b22267aabbfe58df Mon Sep 17 00:00:00 2001
From: Weston Pace <weston.pace@gmail.com>
Date: Tue, 6 Jan 2026 06:59:12 -0800
Subject: [PATCH 2/4] Remove uring references

---
 Cargo.lock                                                   | 1 +
 python/python/ci_benchmarks/benchmarks/test_ivf_pq_search.py | 5 +++--
 rust/lance/benches/vector_throughput.rs                      | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 9cc4d912115..583283b75b2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4739,6 +4739,7 @@ dependencies = [
  "lance-test-macros",
  "lance-testing",
  "lapack",
+ "libc",
  "log",
  "lzma-sys",
  "mock_instant",
diff --git a/python/python/ci_benchmarks/benchmarks/test_ivf_pq_search.py b/python/python/ci_benchmarks/benchmarks/test_ivf_pq_search.py
index c271c574679..5bef0492964 100644
--- a/python/python/ci_benchmarks/benchmarks/test_ivf_pq_search.py
+++ b/python/python/ci_benchmarks/benchmarks/test_ivf_pq_search.py
@@ -82,7 +82,7 @@ def _get_or_create_dataset(num_rows: int, dim: int = 1024) -> str:
     # Use a fixed directory path based on parameters
     tmpdir = Path(tempfile.gettempdir()) / f"lance_bench_{num_rows}_{dim}"
     tmpdir.mkdir(exist_ok=True)
-    dataset_uri = "file+uring://" + str(tmpdir / "vector_dataset.lance")
+    dataset_uri = "file://" + str(tmpdir / "vector_dataset.lance")
 
     # Check if dataset already exists and has correct row count
     try:
@@ -92,7 +92,8 @@ def _get_or_create_dataset(num_rows: int, dim: int = 1024) -> str:
             return dataset_uri
         else:
             print(
-                f"Dataset exists but has wrong row count ({ds.count_rows()} vs {num_rows}), recreating..."
+                "Dataset exists but has wrong row count "
+                f"({ds.count_rows()} vs {num_rows}), recreating..."
             )
     except Exception:
         print(f"Creating new dataset at {dataset_uri}")
diff --git a/rust/lance/benches/vector_throughput.rs b/rust/lance/benches/vector_throughput.rs
index 89f82372988..b03d725324b 100644
--- a/rust/lance/benches/vector_throughput.rs
+++ b/rust/lance/benches/vector_throughput.rs
@@ -55,7 +55,7 @@ struct CachedDataset {
 fn get_or_create_dataset(rt: &Runtime, version: LanceFileVersion) -> Arc<CachedDataset> {
     // Create dataset in fixed temp directory
     let uri = format!(
-        "file+uring:///tmp/lance_bench_throughput_{}_{}_{}",
+        "file:///tmp/lance_bench_throughput_{}_{}_{}",
         NUM_ROWS, DIM, version
     );
 

From 5c4102b07716f2aa9dafaee96248afd26cf05970 Mon Sep 17 00:00:00 2001
From: Weston Pace <weston.pace@gmail.com>
Date: Tue, 6 Jan 2026 13:59:28 -0800
Subject: [PATCH 3/4] Replace println with logging.  Fix cache removal code

---
 rust/lance/benches/vector_throughput.rs | 75 +++++++++++++------------
 1 file changed, 40 insertions(+), 35 deletions(-)

diff --git a/rust/lance/benches/vector_throughput.rs b/rust/lance/benches/vector_throughput.rs
index b03d725324b..ac03932f849 100644
--- a/rust/lance/benches/vector_throughput.rs
+++ b/rust/lance/benches/vector_throughput.rs
@@ -6,8 +6,6 @@
 //! This benchmark measures concurrent vector search performance with IVF_PQ indexes,
 //! similar to the Python test_ivf_pq_throughput benchmark.
 
-#![allow(clippy::print_stdout)]
-
 use std::sync::Arc;
 
 use arrow_array::{FixedSizeListArray, Float32Array, RecordBatch, RecordBatchIterator};
@@ -15,6 +13,7 @@ use arrow_schema::{DataType, Field, FieldRef, Schema as ArrowSchema};
 use criterion::{criterion_group, criterion_main, BatchSize, Criterion, Throughput};
 use futures::{StreamExt, TryStreamExt};
 use lance_file::version::LanceFileVersion;
+use log::info;
 #[cfg(target_os = "linux")]
 use pprof::criterion::{Output, PProfProfiler};
 use rand::Rng;
@@ -46,18 +45,21 @@ const MAX_ITERATIONS: usize = 50;
 
 /// Cached dataset with pre-generated query vectors
 struct CachedDataset {
-    uri: String,
     dataset: Arc<Dataset>,
     query_vectors: Vec<Arc<Float32Array>>,
 }
 
+fn dataset_path(version: LanceFileVersion) -> String {
+    format!(
+        "/tmp/lance_bench_throughput_{}_{}_{}",
+        NUM_ROWS, DIM, version
+    )
+}
+
 /// Get or create a cached dataset with IVF_PQ index and query vectors
 fn get_or_create_dataset(rt: &Runtime, version: LanceFileVersion) -> Arc<CachedDataset> {
     // Create dataset in fixed temp directory
-    let uri = format!(
-        "file:///tmp/lance_bench_throughput_{}_{}_{}",
-        NUM_ROWS, DIM, version
-    );
+    let uri = format!("file://{}", dataset_path(version));
 
     rt.block_on(async {
         // Check if dataset exists on disk with correct row count
@@ -67,29 +69,29 @@ fn get_or_create_dataset(rt: &Runtime, version: LanceFileVersion) -> Arc<CachedD
         if let Ok(dataset) = Dataset::open(&uri).await {
             let row_count = dataset.count_rows(None).await.unwrap();
             if row_count == NUM_ROWS {
-                println!("Reusing existing dataset at {} ({} rows)", uri, row_count);
+                info!("Reusing existing dataset at {} ({} rows)", uri, row_count);
                 needs_creation = false;
 
                 // Check if index exists
                 let indices = dataset.load_indices().await.unwrap();
                 if !indices.is_empty() {
-                    println!(
+                    info!(
                         "Dataset already has {} index(es), skipping index creation",
                         indices.len()
                     );
                     needs_indexing = false;
                 } else {
-                    println!("Dataset exists but has no index, will create index");
+                    info!("Dataset exists but has no index, will create index");
                 }
             } else {
-                println!(
+                info!(
                     "Dataset exists but has wrong row count ({} vs {}), recreating",
                     row_count, NUM_ROWS
                 );
-                std::fs::remove_dir_all(&uri).ok();
+                std::fs::remove_dir_all(dataset_path(version)).ok(); // fs path, not file:// URI
             }
         } else {
-            println!(
+            info!(
                 "Creating new dataset with {} rows, {} dimensions",
                 NUM_ROWS, DIM
             );
@@ -112,7 +114,6 @@ fn get_or_create_dataset(rt: &Runtime, version: LanceFileVersion) -> Arc<CachedD
         let query_vectors = generate_query_vectors();
 
         let cached = Arc::new(CachedDataset {
-            uri: uri.clone(),
             dataset: Arc::new(dataset),
             query_vectors,
         });
@@ -161,12 +162,12 @@ async fn create_dataset(uri: &str) {
         .await
         .unwrap();
 
-    println!("Dataset created at {}", uri);
+    info!("Dataset created at {}", uri);
 }
 
 /// Create IVF_PQ index on the dataset
 async fn create_ivf_pq_index(dataset: &mut Dataset) {
-    println!("Creating IVF_PQ index...");
+    info!("Creating IVF_PQ index...");
 
     let ivf_params = IvfBuildParams {
         num_partitions: Some(IVF_PARTITIONS),
@@ -191,7 +192,7 @@ async fn create_ivf_pq_index(dataset: &mut Dataset) {
         .await
         .unwrap();
 
-    println!("IVF_PQ index created");
+    info!("IVF_PQ index created");
 }
 
 /// Generate random query vectors
@@ -207,37 +208,41 @@ fn generate_query_vectors() -> Vec<Arc<Float32Array>> {
 
 /// Drop dataset files from OS page cache (Linux only)
 #[cfg(target_os = "linux")]
-fn drop_dataset_from_cache(uri: &str) -> std::io::Result<()> {
+fn drop_dataset_from_cache(dataset_dir: &str) -> std::io::Result<()> {
     use std::fs;
     use std::os::unix::io::AsRawFd;
 
     // Walk the dataset directory and drop each file from cache
-    if let Ok(entries) = fs::read_dir(uri) {
-        for entry in entries.flatten() {
-            let path = entry.path();
-            if path.is_file() {
-                if let Ok(file) = fs::File::open(&path) {
-                    let fd = file.as_raw_fd();
-                    // POSIX_FADV_DONTNEED = 4
-                    let result =
-                        unsafe { libc::posix_fadvise(fd, 0, 0, libc::POSIX_FADV_DONTNEED) };
-                    if result != 0 {
-                        eprintln!(
-                            "Warning: Failed to drop {:?} from cache: {}",
-                            path,
-                            std::io::Error::from_raw_os_error(result)
-                        );
-                    }
+    let mut num_dropped = 0;
+    let entries = fs::read_dir(format!("{}/data", dataset_dir)).unwrap();
+    for entry in entries.flatten() {
+        let path = entry.path();
+        if path.is_file() {
+            if let Ok(file) = fs::File::open(&path) {
+                let fd = file.as_raw_fd();
+                // POSIX_FADV_DONTNEED = 4
+                let result = unsafe { libc::posix_fadvise(fd, 0, 0, libc::POSIX_FADV_DONTNEED) };
+                if result != 0 {
+                    panic!(
+                        "Warning: Failed to drop {:?} from cache: {}",
+                        path,
+                        std::io::Error::from_raw_os_error(result)
+                    );
                 }
+                num_dropped += 1;
             }
         }
     }
+    if num_dropped == 0 {
+        // Sanity check to ensure that we actually dropped some files from cache.
+        panic!("No files dropped from cache");
+    }
 
     Ok(())
 }
 
 #[cfg(not(target_os = "linux"))]
-fn drop_dataset_from_cache(_uri: &str) -> std::io::Result<()> {
+fn drop_dataset_from_cache(_path: &str) -> std::io::Result<()> {
     Ok(())
 }
 
@@ -312,7 +317,7 @@ fn bench_ivf_pq_throughput(c: &mut Criterion) {
                             || {
                                 // Setup: drop cache if uncached
                                 if !cached {
-                                    drop_dataset_from_cache(&cached_dataset.uri).ok();
+                                    drop_dataset_from_cache(&dataset_path(version)).ok();
                                 }
                             },
                             |_| {

From 856b255d2306417dbcc275e9e82150404c532725 Mon Sep 17 00:00:00 2001
From: Weston Pace <weston.pace@gmail.com>
Date: Fri, 16 Jan 2026 06:56:19 -0800
Subject: [PATCH 4/4] Address clippy suggestions

---
 rust/lance/benches/vector_throughput.rs | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/rust/lance/benches/vector_throughput.rs b/rust/lance/benches/vector_throughput.rs
index ac03932f849..aa557863d2b 100644
--- a/rust/lance/benches/vector_throughput.rs
+++ b/rust/lance/benches/vector_throughput.rs
@@ -113,12 +113,10 @@ fn get_or_create_dataset(rt: &Runtime, version: LanceFileVersion) -> Arc<CachedD
         // Generate query vectors
         let query_vectors = generate_query_vectors();
 
-        let cached = Arc::new(CachedDataset {
+        Arc::new(CachedDataset {
             dataset: Arc::new(dataset),
             query_vectors,
-        });
-
-        cached
+        })
     })
 }