diff --git a/benchmarks/dbpedia-openai/README.md b/benchmarks/dbpedia-openai/README.md index f0159de751a..50d218623ec 100644 --- a/benchmarks/dbpedia-openai/README.md +++ b/benchmarks/dbpedia-openai/README.md @@ -6,15 +6,8 @@ contains 1M openai embeddings. ## Prepare Dataset ```sh -# Python 3.10+ -python3 -m venv venv -. ./venv/bin/activate - -# install dependencies -pip install -r requirements.txt - # Generate dataset in lance format. -./datagen.py +uv run ./datagen.py ``` ## Run benchmark @@ -23,5 +16,5 @@ pip install -r requirements.txt as well as `refine_factor`. ```sh -./benchmarks.py -k 20 +uv run ./benchmarks.py ``` \ No newline at end of file diff --git a/benchmarks/dbpedia-openai/benchmarks.py b/benchmarks/dbpedia-openai/benchmarks.py index d3b783aef84..21469557a6b 100755 --- a/benchmarks/dbpedia-openai/benchmarks.py +++ b/benchmarks/dbpedia-openai/benchmarks.py @@ -2,6 +2,7 @@ # import argparse +import time import lance import numpy as np @@ -20,7 +21,7 @@ def run_query( results = [] for query in queries: tbl = ds.scanner( - columns=["_id"], + columns=["_id", "_distance"], nearest={ "column": "openai", "q": query, @@ -56,7 +57,7 @@ def compute_recall(gt: np.ndarray, result: np.ndarray) -> float: def main(): parser = argparse.ArgumentParser() - parser.add_argument("uri", help="dataset uri") + parser.add_argument("--uri", help="dataset uri", default="./dbpedia.lance") parser.add_argument( "-k", "--top-k", @@ -90,6 +91,7 @@ def main(): for ivf in [256, 512, 1024]: for pq in [32, 96, 192]: + start = time.perf_counter() ds.create_index( "openai", "IVF_PQ", @@ -98,6 +100,8 @@ def main(): replace=True, metric=args.metric, ) + end = time.perf_counter() + print(f"Create IVF{ivf}_PQ{pq} index in {end - start:0.2f}s") for refine in [None, 2, 5, 10, 50, 100]: results = run_query( ds, diff --git a/benchmarks/dbpedia-openai/pyproject.toml b/benchmarks/dbpedia-openai/pyproject.toml new file mode 100644 index 00000000000..0164aa05d86 --- /dev/null +++ 
b/benchmarks/dbpedia-openai/pyproject.toml @@ -0,0 +1,10 @@ +[project] +name = "dbpedia-openai" +version = "0.1.0" +description = "Benchmarks for huggingface dbpedia dataset with OpenAI embeddings" +readme = "README.md" +requires-python = ">=3.12,<3.14" +dependencies = ["pylance", "datasets"] + +[dependency-groups] +dev = ["ruff"]