From ad6a209a966677740b0f766fba2c132a7c97d423 Mon Sep 17 00:00:00 2001
From: Lei Xu <eddyxu@gmail.com>
Date: Wed, 10 Dec 2025 09:18:42 -0800
Subject: [PATCH 1/2] use uv to run dbpedia

---
 benchmarks/dbpedia-openai/README.md      | 11 ++---------
 benchmarks/dbpedia-openai/benchmarks.py  |  8 ++++++--
 benchmarks/dbpedia-openai/pyproject.toml | 13 +++++++++++++
 3 files changed, 21 insertions(+), 11 deletions(-)
 create mode 100644 benchmarks/dbpedia-openai/pyproject.toml

diff --git a/benchmarks/dbpedia-openai/README.md b/benchmarks/dbpedia-openai/README.md
index f0159de751a..50d218623ec 100644
--- a/benchmarks/dbpedia-openai/README.md
+++ b/benchmarks/dbpedia-openai/README.md
@@ -6,15 +6,8 @@ contains 1M openai embeddings.
 ## Prepare Dataset
 
 ```sh
-# Python 3.10+
-python3 -m venv venv
-. ./venv/bin/activate
-
-# install dependencies
-pip install -r requirements.txt
-
 # Generate dataset in lance format.
-./datagen.py
+uv run ./datagen.py
 ```
 
 ## Run benchmark
@@ -23,5 +16,5 @@ pip install -r requirements.txt
 as well as `refine_factor`.
 
 ```sh
-./benchmarks.py -k 20
+uv run ./benchmarks.py
 ```
\ No newline at end of file
diff --git a/benchmarks/dbpedia-openai/benchmarks.py b/benchmarks/dbpedia-openai/benchmarks.py
index d3b783aef84..21469557a6b 100755
--- a/benchmarks/dbpedia-openai/benchmarks.py
+++ b/benchmarks/dbpedia-openai/benchmarks.py
@@ -2,6 +2,7 @@
 #
 
 import argparse
+import time
 
 import lance
 import numpy as np
@@ -20,7 +21,7 @@ def run_query(
     results = []
     for query in queries:
         tbl = ds.scanner(
-            columns=["_id"],
+            columns=["_id", "_distance"],
             nearest={
                 "column": "openai",
                 "q": query,
@@ -56,7 +57,7 @@ def compute_recall(gt: np.ndarray, result: np.ndarray) -> float:
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("uri", help="dataset uri")
+    parser.add_argument("--uri", help="dataset uri", default="./dbpedia.lance")
     parser.add_argument(
         "-k",
         "--top-k",
@@ -90,6 +91,7 @@ def main():
 
     for ivf in [256, 512, 1024]:
         for pq in [32, 96, 192]:
+            start = time.perf_counter()
             ds.create_index(
                 "openai",
                 "IVF_PQ",
@@ -98,6 +100,8 @@ def main():
                 replace=True,
                 metric=args.metric,
             )
+            end = time.perf_counter()
+            print(f"Create IVF{ivf}_PQ{pq} index in {end - start:0.2f}s")
             for refine in [None, 2, 5, 10, 50, 100]:
                 results = run_query(
                     ds,
diff --git a/benchmarks/dbpedia-openai/pyproject.toml b/benchmarks/dbpedia-openai/pyproject.toml
new file mode 100644
index 00000000000..7f6635e1b48
--- /dev/null
+++ b/benchmarks/dbpedia-openai/pyproject.toml
@@ -0,0 +1,13 @@
+[project]
+name = "dbpedia-openai"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12,<3.14"
+dependencies = [
+	"pylance",
+	"datasets",
+]
+
+[dependency-groups]
+dev = [ "ruff" ]

From 7e8879ca545525ec0eb3af22c40f5bf21f707d99 Mon Sep 17 00:00:00 2001
From: Lei Xu <eddyxu@gmail.com>
Date: Wed, 10 Dec 2025 09:22:41 -0800
Subject: [PATCH 2/2] tidy pyproject.toml description and formatting

---
 benchmarks/dbpedia-openai/pyproject.toml | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/benchmarks/dbpedia-openai/pyproject.toml b/benchmarks/dbpedia-openai/pyproject.toml
index 7f6635e1b48..0164aa05d86 100644
--- a/benchmarks/dbpedia-openai/pyproject.toml
+++ b/benchmarks/dbpedia-openai/pyproject.toml
@@ -1,13 +1,10 @@
 [project]
 name = "dbpedia-openai"
 version = "0.1.0"
-description = "Add your description here"
+description = "Benchmarks for huggingface dbpedia dataset with OpenAI embeddings"
 readme = "README.md"
 requires-python = ">=3.12,<3.14"
-dependencies = [
-	"pylance",
-	"datasets",
-]
+dependencies = ["pylance", "datasets"]
 
 [dependency-groups]
-dev = [ "ruff" ]
+dev = ["ruff"]