Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
3da8a9d
Add pgdiskann client
wahajali Aug 28, 2024
b85d7d2
Add testing configuration, capture table/index sizes
wahajali Sep 1, 2024
e0e6309
Increase timeout
wahajali Sep 2, 2024
4ad827a
update run count
wahajali Sep 2, 2024
0e74142
Update parallel workers
wahajali Sep 2, 2024
4a8c1c9
Update num-concurrency
wahajali Sep 2, 2024
6380d82
Fix spelling
wahajali Sep 2, 2024
778fba8
Saving benchmark results in DB
Sheharyar570 Sep 2, 2024
2900e13
Fix spelling mistake
Sheharyar570 Sep 2, 2024
7652731
Reduce concurrency and add K
wahajali Sep 3, 2024
b7c7f68
Merge branch 'pgvector-testing' of github.com:EmumbaOrg/VectorDBBench…
wahajali Sep 3, 2024
e931034
Add pgdiskann client
wahajali Aug 28, 2024
50ff354
Add CLI support in pgdiskann
Sheharyar570 Sep 6, 2024
0e36aeb
Merge branch 'diskann-support' of github.com:EmumbaOrg/VectorDBBench …
wahajali Sep 8, 2024
d081997
Merge pull request #5 from EmumbaOrg/add-pgdiskann-cli-support
wahajali Sep 8, 2024
021cbde
Merge branch 'diskann-support' of github.com:EmumbaOrg/VectorDBBench …
wahajali Sep 8, 2024
f984400
Remove data from git
wahajali Sep 8, 2024
b7f982b
Merge branch 'pgvector-testing' into diskann-testing
wahajali Sep 8, 2024
c1cd526
Add get_size_info method for pgdiskann
wahajali Sep 8, 2024
4b773c4
Add automation for diskann
wahajali Sep 8, 2024
9927505
Add sample config files
wahajali Sep 8, 2024
a54a4f4
Set storage to plain
wahajali Sep 9, 2024
3555837
Update timeout
wahajali Sep 9, 2024
8468260
Update run.py
wahajali Sep 10, 2024
6d1a701
added xxl instance config file
Sheharyar570 Sep 10, 2024
421b000
Merge pull request #7 from EmumbaOrg/add-xxl-config-file
wahajali Sep 10, 2024
4f187e2
Add filter search sample config
wahajali Sep 14, 2024
9ada5ee
Add script that creates subsets of dataset.
Sheharyar570 Oct 4, 2024
3ea17ed
Add script to compute ground truth for dataset subset
Sheharyar570 Oct 4, 2024
174dbf2
Fixed dataset file name issues
Sheharyar570 Oct 6, 2024
113d3ae
Fixed missing gt_file_path parameter issue
Sheharyar570 Oct 8, 2024
1202350
Added initial custom dataset config and run file.
Sheharyar570 Oct 8, 2024
6409c81
fixed config keys
Sheharyar570 Oct 8, 2024
853f254
Updated config for memory comparison run
Sheharyar570 Oct 8, 2024
a3b4588
remove extra use-shuffled flag from command
Sheharyar570 Oct 8, 2024
5d1dde7
Added new arguments for command line interface.
Sheharyar570 Oct 9, 2024
cfc4cf4
Merge branch 'diskann-testing' of github.com:EmumbaOrg/VectorDBBench …
wahajali Oct 11, 2024
cc252b8
Merge branch 'main' of github.com:EmumbaOrg/VectorDBBench into diskan…
wahajali Oct 11, 2024
c1d0dba
Refactored, command line arguments made consistent
Sheharyar570 Oct 11, 2024
a017db9
Added progress log
Sheharyar570 Oct 11, 2024
dfc9a95
Generates test dataset of a given size using original dataset files.
Sheharyar570 Oct 11, 2024
1ba7fbd
Added hit ratio logs in results.
Sheharyar570 Oct 11, 2024
889aad5
Changed dataset dir default value.
Sheharyar570 Oct 11, 2024
4ab4287
Merge branch 'main' of https://github.com/EmumbaOrg/VectorDBBench int…
Sheharyar570 Oct 15, 2024
fc2faf3
fixed test dataset generation script
Sheharyar570 Oct 17, 2024
d3c003d
Merge branch 'main' of github.com:zilliztech/VectorDBBench into diska…
wahajali Oct 28, 2024
e81eb0f
Added runner script for pgvectorscale
wahajali Nov 4, 2024
95bd59e
Add sample config for pgvectorscale
wahajali Nov 4, 2024
ae98055
Generate test data script updated
Sheharyar570 Nov 14, 2024
cb2b736
Filtering case configs added for hnsw and diskann
Sheharyar570 Nov 19, 2024
1f59ba8
fixed search serial and concurrent keys issue in run files
Sheharyar570 Nov 19, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ All the database client supported
| pgvector | `pip install vectordb-bench[pgvector]` |
| pgvecto.rs | `pip install vectordb-bench[pgvecto_rs]` |
| pgvectorscale | `pip install vectordb-bench[pgvectorscale]` |
| pgdiskann | `pip install vectordb-bench[pgdiskann]` |
| redis | `pip install vectordb-bench[redis]` |
| memorydb | `pip install vectordb-bench[memorydb]` |
| chromadb | `pip install vectordb-bench[chromadb]` |
Expand Down
77 changes: 77 additions & 0 deletions compute_gt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import argparse
import numpy as np
import pandas as pd
import pyarrow as pa
import psycopg
import psycopg.sql as sql
import pyarrow.parquet as pq
from pgvector.psycopg import register_vector


def query_database(query, emb, k, cursor):
    """Run a prepared top-k vector query and return neighbor ids.

    ``cursor`` may be any psycopg object exposing ``execute`` (a cursor or
    a psycopg3 connection); the first column of each fetched row is
    converted to ``int``.
    """
    rows = cursor.execute(query, (emb, k), prepare=True, binary=True).fetchall()
    neighbor_ids = []
    for row in rows:
        neighbor_ids.append(int(row[0]))
    return neighbor_ids

def write_parquet_file(data, file_path):
    """Persist (id, neighbors_id) rows to a Parquet file at *file_path*."""
    frame = pd.DataFrame(data, columns=["id", "neighbors_id"])
    pq.write_table(pa.Table.from_pandas(frame), file_path, use_dictionary=False)

def main():
    """Compute exact top-k ground truth for a Parquet test file.

    Each query embedding in --test-file-path is searched against the
    pgvector table --table-name using a cosine-distance ORDER BY, and the
    resulting (id, neighbors_id) pairs are written to --gt-file-path.
    """
    parser = argparse.ArgumentParser(
        description="Compute ground-truth nearest neighbors for a Parquet test file."
    )
    parser.add_argument("--test-file-path", type=str, help="Path of the parquet test file.")
    parser.add_argument("--gt-file-path", type=str, help="Parquet file path where ground truth will be saved.")
    parser.add_argument("--table-name", type=str, help="Vector table name")
    # type=int (was str): the value is a row count bound to "LIMIT %s::int".
    parser.add_argument("--k", type=int, help="K nearest neighbors")
    parser.add_argument("--db-name", type=str, help="Database name")
    parser.add_argument("--db-user", type=str, help="Database user")
    parser.add_argument("--db-pass", type=str, help="Database password")
    parser.add_argument("--host", type=str, help="Database host")
    parser.add_argument("--port", type=str, help="Database port")
    args = parser.parse_args()

    # Example of the vectordbbench command used to load the table this
    # script queries (see project README for the full flag list):
    #   vectordbbench pgvectorhnsw --drop-old --load --skip-search-serial
    #     --skip-search-concurrent --case-type PerformanceCustomDataset ...

    df = pd.read_parquet(args.test_file_path)
    connection = None  # guarded in finally: connect() itself may raise
    try:
        connection = psycopg.connect(
            dbname=args.db_name,
            user=args.db_user,
            password=args.db_pass,
            host=args.host,
            port=args.port,
        )
        register_vector(connection)
        print("Connection established.")

        # The query text is loop-invariant; build it once.
        query = sql.Composed(
            [
                sql.SQL("SELECT id FROM public.{} ORDER BY embedding <=> ").format(
                    sql.Identifier(args.table_name)
                ),
                sql.SQL(" %s::vector LIMIT %s::int"),
            ]
        )

        results = []
        for count, (_, row) in enumerate(df.iterrows(), start=1):
            emb = np.asarray(row["emb"])
            neighbors = np.asarray(query_database(query, emb, args.k, connection))
            results.append((row["id"], neighbors))
            if count % 10 == 0:
                print(f"GT computed for {count} rows.")

        write_parquet_file(results, args.gt_file_path)
        print("Ground truth calculated and saved.")
    except Exception as exc:
        # Was a bare ``except`` that reported every failure as
        # "Connection failed." (and raised NameError on ``connection``
        # when connect() itself failed). Surface the real error instead.
        print(f"Ground truth computation failed: {exc}")
        raise
    finally:
        if connection is not None:
            connection.close()


if __name__ == "__main__":
    main()
65 changes: 65 additions & 0 deletions config-diskann-filter.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
{
"database": {
"host": "localhost",
"username": "postgres",
"password": "postgres",
"db_name": "ann-filter",
"instance_type": "Standard_D8ds_v5",
"provider": "azure",
"enable_seqscan": "on"
},
"cases": [
{
"db-label": "diskann-no-filter",
"drop_old": true,
"load": true,
"search-serial": true,
"search-concurrent": true,
"case-type": "Performance1536D500K",
"maintenance-work-mem": "8GB",
"max-parallel-workers": 7,
"max-neighbors": 64,
"l-value-ib": 128,
"l-value-is": [32],
"num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
"concurrency-duration": 30,
"k": 10,
"run_count": 2
},
{
"db-label": "diskann-filter-high",
"drop_old": false,
"load": false,
"search-serial": true,
"search-concurrent": true,
"case-type": "Performance1536D500K99P",
"maintenance-work-mem": "8GB",
"max-parallel-workers": 7,
"max-neighbors": 64,
"l-value-ib": 128,
"l-value-is": [32],
"num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
"concurrency-duration": 30,
"k": 10,
"run_count": 2
},
{
"db-label": "diskann-filter-low",
"drop_old": false,
"load": false,
"search-serial": true,
"search-concurrent": true,
"case-type": "Performance1536D500K1P",
"maintenance-work-mem": "8GB",
"max-parallel-workers": 7,
"max-neighbors": 64,
"l-value-ib": 128,
"l-value-is": [32],
"num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
"concurrency-duration": 30,
"k": 10,
"run_count": 2
}
]
}

66 changes: 66 additions & 0 deletions config-hnsw-filter.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@

{
"database": {
"host": "localhost",
"username": "postgres",
"password": "postgres",
"db_name": "ann-filter",
"instance_type": "Standard_D8ds_v5",
"provider": "azure",
"enable_seqscan": "on"
},
"cases": [
{
"db-label": "hnsw-no-filter",
"drop_old": true,
"load": true,
"search-serial": true,
"search-concurrent": true,
"case-type": "Performance1536D500K",
"maintenance-work-mem": "8GB",
"max-parallel-workers": 7,
"ef-search": [40],
"ef-construction": 128,
"m": 32,
"num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
"concurrency-duration": 30,
"k": 10,
"run_count": 2
},
{
"db-label": "hnsw-no-filter-high",
"drop_old": false,
"load": false,
"search-serial": true,
"search-concurrent": true,
"case-type": "Performance1536D500K99P",
"maintenance-work-mem": "8GB",
"max-parallel-workers": 7,
"ef-search": [40],
"ef-construction": 128,
"m": 32,
"num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
"concurrency-duration": 30,
"k": 10,
"run_count": 2
},
{
"db-label": "hnsw-filter-low",
"drop_old": false,
"load": false,
"search-serial": true,
"search-concurrent": true,
"case-type": "Performance1536D500K1P",
"maintenance-work-mem": "8GB",
"max-parallel-workers": 7,
"ef-search": [40],
"ef-construction": 128,
"m": 32,
"num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
"concurrency-duration": 30,
"k": 10,
"run_count": 2
}
]
}

54 changes: 54 additions & 0 deletions create_dataset_subsets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import os
import shutil
import argparse


def get_file_name(file_name: str, file_prefix: str, files_count: int) -> str:
    """Rewrite *file_name* so its "of-NN" suffix matches *files_count*.

    A one-file subset collapses to "<file_prefix>.parquet"; otherwise
    everything up to "of-" is kept and the new count is appended,
    zero-padded to two digits.
    """
    if files_count == 1:
        return f"{file_prefix}.parquet"
    stem = file_name.split("of-")[0]
    return f"{stem}of-{files_count:02d}.parquet"

def create_subsets(base_dir, save_dir_path, subset_prefix, file_prefix, step_size):
    """Build cumulative dataset subsets from the train files in *base_dir*.

    Subset i holds the first i train files (renamed so their "of-NN"
    suffix matches the subset's file count) plus a copy of test.parquet,
    written to "<subset_prefix>_<i*step_size/1000>k" under *save_dir_path*.
    """
    train_files = sorted(f for f in os.listdir(base_dir) if f.startswith(file_prefix))

    for subset_size in range(1, len(train_files) + 1):
        label = f"{subset_prefix}_{subset_size * step_size // 1000}k"
        subset_dir = os.path.join(save_dir_path, label)
        os.makedirs(subset_dir, exist_ok=True)

        for name in train_files[:subset_size]:
            shutil.copy(
                os.path.join(base_dir, name),
                os.path.join(subset_dir, get_file_name(name, file_prefix, subset_size)),
            )
        shutil.copy(
            os.path.join(base_dir, "test.parquet"),
            os.path.join(subset_dir, "test.parquet"),
        )

if __name__ == "__main__":
    def _str_to_bool(value: str) -> bool:
        """Parse a CLI boolean value.

        The original used argparse's ``type=bool``, which treats any
        non-empty string — including "False" — as True.
        """
        return value.strip().lower() in ("1", "true", "t", "yes", "y")

    parser = argparse.ArgumentParser(description="Create cumulative subsets of Parquet dataset files.")
    parser.add_argument("--directory", type=str, help="Path to the directory containing Parquet files.")
    parser.add_argument("--save-dir-path", type=str, help="Directory path where data will be saved")
    parser.add_argument("--dataset-name-prefix", type=str, help="Name prefix of the folder where each subset will be saved.")
    parser.add_argument("--is-shuffled", type=_str_to_bool, help="Whether the files are shuffled or not (true/false).")
    args = parser.parse_args()

    file_prefix = "shuffle_train" if args.is_shuffled else "train"
    subset_prefix = args.dataset_name_prefix if args.dataset_name_prefix else "openai"
    # Default to writing subsets next to the source files.
    save_dir_path = args.save_dir_path if args.save_dir_path else args.directory
    step_size = 500_000  # rows per source train file (500k)

    create_subsets(args.directory, save_dir_path, subset_prefix, file_prefix, step_size)
    print(f'Finished creating subsets of Parquet files in {args.directory}.')
78 changes: 78 additions & 0 deletions generate_test_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""Generate a test dataset by sampling embeddings from train Parquet files."""
import argparse
import gc
import glob
import logging
import os

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Parse command line arguments
parser = argparse.ArgumentParser(description='Generate test dataset from Parquet files.')
parser.add_argument('--folder-path', type=str, required=True, help='Path to the folder containing the train files')
parser.add_argument('--file-pattern', type=str, required=False, help='Pattern to match the train files')
parser.add_argument('--queries-per-file', type=int, required=False, help='Number of queries to sample per file')
parser.add_argument('--output-file', type=str, required=False, help='Output Parquet file name')

args = parser.parse_args()

folder_path = args.folder_path
file_pattern = (
    args.file_pattern
    if args.file_pattern
    else "train-*-of-10.parquet"
)
# "is not None" (not truthiness) so an explicit --queries-per-file 0 is
# respected rather than silently replaced by the default.
queries_per_file = (
    args.queries_per_file
    if args.queries_per_file is not None
    else 1000
)
output_file = (
    args.output_file
    if args.output_file
    else 'test-10000.parquet'
)

logging.info(f"Folder path: {folder_path}")
logging.info(f"File pattern: {file_pattern}")
logging.info(f"Queries per file: {queries_per_file}")
logging.info(f"Output file: {output_file}")

schema = pa.schema([
    pa.field('id', pa.int64()),
    pa.field('emb', pa.list_(pa.float64()))  # emb as a list of double precision floats
])

# Open Parquet writer with the specified schema
with pq.ParquetWriter(output_file, schema) as writer:
    # Track the current ID incrementally
    current_id = 0

    # os.path.join fixes the original "folder_path + file_pattern"
    # concatenation, which matched nothing when the folder path had no
    # trailing separator.
    for file_path in glob.glob(os.path.join(folder_path, file_pattern)):
        logging.info(f"Processing file: {file_path}")

        # Load and sample data
        train_table = pq.read_table(file_path, columns=['id', 'emb'])
        train_df = train_table.to_pandas()
        # NOTE(review): sample(n=...) raises if a file has fewer rows than
        # queries_per_file — assumed acceptable for these datasets.
        sampled_df = train_df.sample(n=queries_per_file).reset_index(drop=True)

        # Reassign IDs starting from the current count
        sampled_df['id'] = np.arange(current_id, current_id + queries_per_file)
        current_id += queries_per_file

        # Convert the DataFrame to a PyArrow table with the specified schema
        sampled_table = pa.Table.from_pandas(sampled_df, schema=schema)

        # Write the sampled data to the output file
        writer.write_table(sampled_table)

        # Clean up to free memory
        del train_table, train_df, sampled_df, sampled_table
        gc.collect()

        logging.info(f"Finished processing file: {file_path}")

logging.info(f"{output_file} has been created with {queries_per_file} randomly sampled embeddings per file.")
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ weaviate = [ "weaviate-client" ]
elastic = [ "elasticsearch" ]
pgvector = [ "psycopg", "psycopg-binary", "pgvector" ]
pgvectorscale = [ "psycopg", "psycopg-binary", "pgvector" ]
pgdiskann = [ "psycopg", "psycopg-binary", "pgvector" ]
pgvecto_rs = [ "pgvecto_rs[psycopg3]>=0.2.2" ]
redis = [ "redis" ]
memorydb = [ "memorydb" ]
Expand Down
Loading