From ad6a209a966677740b0f766fba2c132a7c97d423 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 10 Dec 2025 09:18:42 -0800 Subject: [PATCH 1/2] use uv to run dbpedia --- benchmarks/dbpedia-openai/README.md | 11 ++--------- benchmarks/dbpedia-openai/benchmarks.py | 8 ++++++-- benchmarks/dbpedia-openai/pyproject.toml | 13 +++++++++++++ 3 files changed, 21 insertions(+), 11 deletions(-) create mode 100644 benchmarks/dbpedia-openai/pyproject.toml diff --git a/benchmarks/dbpedia-openai/README.md b/benchmarks/dbpedia-openai/README.md index f0159de751a..50d218623ec 100644 --- a/benchmarks/dbpedia-openai/README.md +++ b/benchmarks/dbpedia-openai/README.md @@ -6,15 +6,8 @@ contains 1M openai embeddings. ## Prepare Dataset ```sh -# Python 3.10+ -python3 -m venv venv -. ./venv/bin/activate - -# install dependencies -pip install -r requirements.txt - # Generate dataset in lance format. -./datagen.py +uv run ./datagen.py ``` ## Run benchmark @@ -23,5 +16,5 @@ pip install -r requirements.txt as well as `refine_factor`. 
```sh -./benchmarks.py -k 20 +uv run ./benchmarks.py ``` \ No newline at end of file diff --git a/benchmarks/dbpedia-openai/benchmarks.py b/benchmarks/dbpedia-openai/benchmarks.py index d3b783aef84..21469557a6b 100755 --- a/benchmarks/dbpedia-openai/benchmarks.py +++ b/benchmarks/dbpedia-openai/benchmarks.py @@ -2,6 +2,7 @@ # import argparse +import time import lance import numpy as np @@ -20,7 +21,7 @@ def run_query( results = [] for query in queries: tbl = ds.scanner( - columns=["_id"], + columns=["_id", "_distance"], nearest={ "column": "openai", "q": query, @@ -56,7 +57,7 @@ def compute_recall(gt: np.ndarray, result: np.ndarray) -> float: def main(): parser = argparse.ArgumentParser() - parser.add_argument("uri", help="dataset uri") + parser.add_argument("--uri", help="dataset uri", default="./dbpedia.lance") parser.add_argument( "-k", "--top-k", @@ -90,6 +91,7 @@ def main(): for ivf in [256, 512, 1024]: for pq in [32, 96, 192]: + start = time.perf_counter() ds.create_index( "openai", "IVF_PQ", @@ -98,6 +100,8 @@ def main(): replace=True, metric=args.metric, ) + end = time.perf_counter() + print(f"Create IVF{ivf}_PQ{pq} index in {end - start:0.2f}s") for refine in [None, 2, 5, 10, 50, 100]: results = run_query( ds, diff --git a/benchmarks/dbpedia-openai/pyproject.toml b/benchmarks/dbpedia-openai/pyproject.toml new file mode 100644 index 00000000000..7f6635e1b48 --- /dev/null +++ b/benchmarks/dbpedia-openai/pyproject.toml @@ -0,0 +1,13 @@ +[project] +name = "dbpedia-openai" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.12,<3.14" +dependencies = [ + "pylance", + "datasets", +] + +[dependency-groups] +dev = [ "ruff" ] From 7e8879ca545525ec0eb3af22c40f5bf21f707d99 Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Wed, 10 Dec 2025 09:22:41 -0800 Subject: [PATCH 2/2] messages --- benchmarks/dbpedia-openai/pyproject.toml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git 
a/benchmarks/dbpedia-openai/pyproject.toml b/benchmarks/dbpedia-openai/pyproject.toml index 7f6635e1b48..0164aa05d86 100644 --- a/benchmarks/dbpedia-openai/pyproject.toml +++ b/benchmarks/dbpedia-openai/pyproject.toml @@ -1,13 +1,10 @@ [project] name = "dbpedia-openai" version = "0.1.0" -description = "Add your description here" +description = "Benchmarks for huggingface dbpedia dataset with OpenAI embeddings" readme = "README.md" requires-python = ">=3.12,<3.14" -dependencies = [ - "pylance", - "datasets", -] +dependencies = ["pylance", "datasets"] [dependency-groups] -dev = [ "ruff" ] +dev = ["ruff"]