Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
3da8a9d
Add pgdiskann client
wahajali Aug 28, 2024
b85d7d2
Add testing configuration, capture table/index sizes
wahajali Sep 1, 2024
e0e6309
Increase timeout
wahajali Sep 2, 2024
4ad827a
update run count
wahajali Sep 2, 2024
0e74142
Update parallel workers
wahajali Sep 2, 2024
4a8c1c9
Update num-concurrency
wahajali Sep 2, 2024
6380d82
Fix spelling
wahajali Sep 2, 2024
778fba8
Saving benchmark results in DB
Sheharyar570 Sep 2, 2024
2900e13
Fix spelling mistake
Sheharyar570 Sep 2, 2024
7652731
Reduce concurrency and add K
wahajali Sep 3, 2024
b7c7f68
Merge branch 'pgvector-testing' of github.com:EmumbaOrg/VectorDBBench…
wahajali Sep 3, 2024
e931034
Add pgdiskann client
wahajali Aug 28, 2024
50ff354
Add CLI support in pgdiskann
Sheharyar570 Sep 6, 2024
0e36aeb
Merge branch 'diskann-support' of github.com:EmumbaOrg/VectorDBBench …
wahajali Sep 8, 2024
d081997
Merge pull request #5 from EmumbaOrg/add-pgdiskann-cli-support
wahajali Sep 8, 2024
021cbde
Merge branch 'diskann-support' of github.com:EmumbaOrg/VectorDBBench …
wahajali Sep 8, 2024
f984400
Remove data from git
wahajali Sep 8, 2024
b7f982b
Merge branch 'pgvector-testing' into diskann-testing
wahajali Sep 8, 2024
c1cd526
Add get_size_info method for pgdiskann
wahajali Sep 8, 2024
4b773c4
Add automation for diskann
wahajali Sep 8, 2024
9927505
Add sample config files
wahajali Sep 8, 2024
a54a4f4
Set storage to plain
wahajali Sep 9, 2024
3555837
Update timeout
wahajali Sep 9, 2024
8468260
Update run.py
wahajali Sep 10, 2024
6d1a701
added xxl instance config file
Sheharyar570 Sep 10, 2024
421b000
Merge pull request #7 from EmumbaOrg/add-xxl-config-file
wahajali Sep 10, 2024
4f187e2
Add filter search sample config
wahajali Sep 14, 2024
9ada5ee
Add script that creates subsets of dataset.
Sheharyar570 Oct 4, 2024
3ea17ed
Add script to compute ground truth for dataset subset
Sheharyar570 Oct 4, 2024
174dbf2
Fixed dataset file name issues
Sheharyar570 Oct 6, 2024
113d3ae
Fixed missing gt_file_path parameter issue
Sheharyar570 Oct 8, 2024
1202350
Added initial custom dataset config and run file.
Sheharyar570 Oct 8, 2024
6409c81
fixed config keys
Sheharyar570 Oct 8, 2024
853f254
Updated config for memory comparison run
Sheharyar570 Oct 8, 2024
a3b4588
remove extra use-shuffled flag from command
Sheharyar570 Oct 8, 2024
5d1dde7
Added new arguments for command line interface.
Sheharyar570 Oct 9, 2024
cfc4cf4
Merge branch 'diskann-testing' of github.com:EmumbaOrg/VectorDBBench …
wahajali Oct 11, 2024
cc252b8
Merge branch 'main' of github.com:EmumbaOrg/VectorDBBench into diskan…
wahajali Oct 11, 2024
c1d0dba
Refactored, command line arguments made consistent
Sheharyar570 Oct 11, 2024
a017db9
Added progress log
Sheharyar570 Oct 11, 2024
dfc9a95
Generates test dataset of a given size using original dataset files.
Sheharyar570 Oct 11, 2024
1ba7fbd
Added hit ratio logs in results.
Sheharyar570 Oct 11, 2024
889aad5
Changed dataset dir default value.
Sheharyar570 Oct 11, 2024
4ab4287
Merge branch 'main' of https://github.com/EmumbaOrg/VectorDBBench int…
Sheharyar570 Oct 15, 2024
fc2faf3
fixed test dataset generation script
Sheharyar570 Oct 17, 2024
d3c003d
Merge branch 'main' of github.com:zilliztech/VectorDBBench into diska…
wahajali Oct 28, 2024
e81eb0f
Added runner script for pgvectorscale
wahajali Nov 4, 2024
95bd59e
Add sample config for pgvectorscale
wahajali Nov 4, 2024
ae98055
Generate test data script updated
Sheharyar570 Nov 14, 2024
cb2b736
Filtering case configs added for hnsw and diskann
Sheharyar570 Nov 19, 2024
1f59ba8
fixed search serial and concurrent keys issue in run files
Sheharyar570 Nov 19, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ All the database client supported
| pgvector | `pip install vectordb-bench[pgvector]` |
| pgvecto.rs | `pip install vectordb-bench[pgvecto_rs]` |
| pgvectorscale | `pip install vectordb-bench[pgvectorscale]` |
| pgdiskann | `pip install vectordb-bench[pgdiskann]` |
| redis | `pip install vectordb-bench[redis]` |
| memorydb | `pip install vectordb-bench[memorydb]` |
| chromadb | `pip install vectordb-bench[chromadb]` |
Expand Down
77 changes: 77 additions & 0 deletions compute_gt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import argparse
import numpy as np
import pandas as pd
import pyarrow as pa
import psycopg
import psycopg.sql as sql
import pyarrow.parquet as pq
from pgvector.psycopg import register_vector


def query_database(query, emb, k, cursor):
    """Run a prepared top-k vector query and return neighbor ids.

    ``cursor`` may be any psycopg object exposing ``execute`` (a cursor or
    a psycopg3 connection); the first column of each fetched row is
    converted to ``int``.
    """
    rows = cursor.execute(query, (emb, k), prepare=True, binary=True).fetchall()
    neighbor_ids = []
    for row in rows:
        neighbor_ids.append(int(row[0]))
    return neighbor_ids

def write_parquet_file(data, file_path):
    """Persist (id, neighbors_id) rows to a Parquet file at *file_path*."""
    frame = pd.DataFrame(data, columns=["id", "neighbors_id"])
    pq.write_table(pa.Table.from_pandas(frame), file_path, use_dictionary=False)

def main():
    """Compute exact top-k ground truth for a Parquet test file.

    Each query embedding in --test-file-path is searched against the
    pgvector table --table-name using a cosine-distance ORDER BY, and the
    resulting (id, neighbors_id) pairs are written to --gt-file-path.
    """
    parser = argparse.ArgumentParser(
        description="Compute ground-truth nearest neighbors for a Parquet test file."
    )
    parser.add_argument("--test-file-path", type=str, help="Path of the parquet test file.")
    parser.add_argument("--gt-file-path", type=str, help="Parquet file path where ground truth will be saved.")
    parser.add_argument("--table-name", type=str, help="Vector table name")
    # type=int (was str): the value is a row count bound to "LIMIT %s::int".
    parser.add_argument("--k", type=int, help="K nearest neighbors")
    parser.add_argument("--db-name", type=str, help="Database name")
    parser.add_argument("--db-user", type=str, help="Database user")
    parser.add_argument("--db-pass", type=str, help="Database password")
    parser.add_argument("--host", type=str, help="Database host")
    parser.add_argument("--port", type=str, help="Database port")
    args = parser.parse_args()

    # Example of the vectordbbench command used to load the table this
    # script queries (see project README for the full flag list):
    #   vectordbbench pgvectorhnsw --drop-old --load --skip-search-serial
    #     --skip-search-concurrent --case-type PerformanceCustomDataset ...

    df = pd.read_parquet(args.test_file_path)
    connection = None  # guarded in finally: connect() itself may raise
    try:
        connection = psycopg.connect(
            dbname=args.db_name,
            user=args.db_user,
            password=args.db_pass,
            host=args.host,
            port=args.port,
        )
        register_vector(connection)
        print("Connection established.")

        # The query text is loop-invariant; build it once.
        query = sql.Composed(
            [
                sql.SQL("SELECT id FROM public.{} ORDER BY embedding <=> ").format(
                    sql.Identifier(args.table_name)
                ),
                sql.SQL(" %s::vector LIMIT %s::int"),
            ]
        )

        results = []
        for count, (_, row) in enumerate(df.iterrows(), start=1):
            emb = np.asarray(row["emb"])
            neighbors = np.asarray(query_database(query, emb, args.k, connection))
            results.append((row["id"], neighbors))
            if count % 10 == 0:
                print(f"GT computed for {count} rows.")

        write_parquet_file(results, args.gt_file_path)
        print("Ground truth calculated and saved.")
    except Exception as exc:
        # Was a bare ``except`` that reported every failure as
        # "Connection failed." (and raised NameError on ``connection``
        # when connect() itself failed). Surface the real error instead.
        print(f"Ground truth computation failed: {exc}")
        raise
    finally:
        if connection is not None:
            connection.close()


if __name__ == "__main__":
    main()
65 changes: 65 additions & 0 deletions config-diskann-filter.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
{
"database": {
"host": "localhost",
"username": "postgres",
"password": "postgres",
"db_name": "ann-filter",
"instance_type": "Standard_D8ds_v5",
"provider": "azure",
"enable_seqscan": "on"
},
"cases": [
{
"db-label": "diskann-no-filter",
"drop_old": true,
"load": true,
"search-serial": true,
"search-concurrent": true,
"case-type": "Performance1536D500K",
"maintenance-work-mem": "8GB",
"max-parallel-workers": 7,
"max-neighbors": 64,
"l-value-ib": 128,
"l-value-is": [32],
"num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
"concurrency-duration": 30,
"k": 10,
"run_count": 2
},
{
"db-label": "diskann-filter-high",
"drop_old": false,
"load": false,
"search-serial": true,
"search-concurrent": true,
"case-type": "Performance1536D500K99P",
"maintenance-work-mem": "8GB",
"max-parallel-workers": 7,
"max-neighbors": 64,
"l-value-ib": 128,
"l-value-is": [32],
"num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
"concurrency-duration": 30,
"k": 10,
"run_count": 2
},
{
"db-label": "diskann-filter-low",
"drop_old": false,
"load": false,
"search-serial": true,
"search-concurrent": true,
"case-type": "Performance1536D500K1P",
"maintenance-work-mem": "8GB",
"max-parallel-workers": 7,
"max-neighbors": 64,
"l-value-ib": 128,
"l-value-is": [32],
"num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
"concurrency-duration": 30,
"k": 10,
"run_count": 2
}
]
}

66 changes: 66 additions & 0 deletions config-hnsw-filter.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@

{
"database": {
"host": "localhost",
"username": "postgres",
"password": "postgres",
"db_name": "ann-filter",
"instance_type": "Standard_D8ds_v5",
"provider": "azure",
"enable_seqscan": "on"
},
"cases": [
{
"db-label": "hnsw-no-filter",
"drop_old": true,
"load": true,
"search-serial": true,
"search-concurrent": true,
"case-type": "Performance1536D500K",
"maintenance-work-mem": "8GB",
"max-parallel-workers": 7,
"ef-search": [40],
"ef-construction": 128,
"m": 32,
"num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
"concurrency-duration": 30,
"k": 10,
"run_count": 2
},
{
"db-label": "hnsw-no-filter-high",
"drop_old": false,
"load": false,
"search-serial": true,
"search-concurrent": true,
"case-type": "Performance1536D500K99P",
"maintenance-work-mem": "8GB",
"max-parallel-workers": 7,
"ef-search": [40],
"ef-construction": 128,
"m": 32,
"num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
"concurrency-duration": 30,
"k": 10,
"run_count": 2
},
{
"db-label": "hnsw-filter-low",
"drop_old": false,
"load": false,
"search-serial": true,
"search-concurrent": true,
"case-type": "Performance1536D500K1P",
"maintenance-work-mem": "8GB",
"max-parallel-workers": 7,
"ef-search": [40],
"ef-construction": 128,
"m": 32,
"num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
"concurrency-duration": 30,
"k": 10,
"run_count": 2
}
]
}

54 changes: 54 additions & 0 deletions create_dataset_subsets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import os
import shutil
import argparse


def get_file_name(file_name: str, file_prefix: str, files_count: int) -> str:
    """Rewrite *file_name* so its "of-NN" suffix matches *files_count*.

    A one-file subset collapses to "<file_prefix>.parquet"; otherwise
    everything up to "of-" is kept and the new count is appended,
    zero-padded to two digits.
    """
    if files_count == 1:
        return f"{file_prefix}.parquet"
    stem = file_name.split("of-")[0]
    return f"{stem}of-{files_count:02d}.parquet"

def create_subsets(base_dir, save_dir_path, subset_prefix, file_prefix, step_size):
    """Build cumulative dataset subsets from the train files in *base_dir*.

    Subset i holds the first i train files (renamed so their "of-NN"
    suffix matches the subset's file count) plus a copy of test.parquet,
    written to "<subset_prefix>_<i*step_size/1000>k" under *save_dir_path*.
    """
    train_files = sorted(f for f in os.listdir(base_dir) if f.startswith(file_prefix))

    for subset_size in range(1, len(train_files) + 1):
        label = f"{subset_prefix}_{subset_size * step_size // 1000}k"
        subset_dir = os.path.join(save_dir_path, label)
        os.makedirs(subset_dir, exist_ok=True)

        for name in train_files[:subset_size]:
            shutil.copy(
                os.path.join(base_dir, name),
                os.path.join(subset_dir, get_file_name(name, file_prefix, subset_size)),
            )
        shutil.copy(
            os.path.join(base_dir, "test.parquet"),
            os.path.join(subset_dir, "test.parquet"),
        )

if __name__ == "__main__":
    def _str_to_bool(value: str) -> bool:
        """Parse a CLI boolean value.

        The original used argparse's ``type=bool``, which treats any
        non-empty string — including "False" — as True.
        """
        return value.strip().lower() in ("1", "true", "t", "yes", "y")

    parser = argparse.ArgumentParser(description="Create cumulative subsets of Parquet dataset files.")
    parser.add_argument("--directory", type=str, help="Path to the directory containing Parquet files.")
    parser.add_argument("--save-dir-path", type=str, help="Directory path where data will be saved")
    parser.add_argument("--dataset-name-prefix", type=str, help="Name prefix of the folder where each subset will be saved.")
    parser.add_argument("--is-shuffled", type=_str_to_bool, help="Whether the files are shuffled or not (true/false).")
    args = parser.parse_args()

    file_prefix = "shuffle_train" if args.is_shuffled else "train"
    subset_prefix = args.dataset_name_prefix if args.dataset_name_prefix else "openai"
    # Default to writing subsets next to the source files.
    save_dir_path = args.save_dir_path if args.save_dir_path else args.directory
    step_size = 500_000  # rows per source train file (500k)

    create_subsets(args.directory, save_dir_path, subset_prefix, file_prefix, step_size)
    print(f'Finished creating subsets of Parquet files in {args.directory}.')
78 changes: 78 additions & 0 deletions generate_test_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""Generate a test dataset by sampling embeddings from train Parquet files."""
import argparse
import gc
import glob
import logging
import os

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Parse command line arguments
parser = argparse.ArgumentParser(description='Generate test dataset from Parquet files.')
parser.add_argument('--folder-path', type=str, required=True, help='Path to the folder containing the train files')
parser.add_argument('--file-pattern', type=str, required=False, help='Pattern to match the train files')
parser.add_argument('--queries-per-file', type=int, required=False, help='Number of queries to sample per file')
parser.add_argument('--output-file', type=str, required=False, help='Output Parquet file name')

args = parser.parse_args()

folder_path = args.folder_path
file_pattern = (
    args.file_pattern
    if args.file_pattern
    else "train-*-of-10.parquet"
)
# "is not None" (not truthiness) so an explicit --queries-per-file 0 is
# respected rather than silently replaced by the default.
queries_per_file = (
    args.queries_per_file
    if args.queries_per_file is not None
    else 1000
)
output_file = (
    args.output_file
    if args.output_file
    else 'test-10000.parquet'
)

logging.info(f"Folder path: {folder_path}")
logging.info(f"File pattern: {file_pattern}")
logging.info(f"Queries per file: {queries_per_file}")
logging.info(f"Output file: {output_file}")

schema = pa.schema([
    pa.field('id', pa.int64()),
    pa.field('emb', pa.list_(pa.float64()))  # emb as a list of double precision floats
])

# Open Parquet writer with the specified schema
with pq.ParquetWriter(output_file, schema) as writer:
    # Track the current ID incrementally
    current_id = 0

    # os.path.join fixes the original "folder_path + file_pattern"
    # concatenation, which matched nothing when the folder path had no
    # trailing separator.
    for file_path in glob.glob(os.path.join(folder_path, file_pattern)):
        logging.info(f"Processing file: {file_path}")

        # Load and sample data
        train_table = pq.read_table(file_path, columns=['id', 'emb'])
        train_df = train_table.to_pandas()
        # NOTE(review): sample(n=...) raises if a file has fewer rows than
        # queries_per_file — assumed acceptable for these datasets.
        sampled_df = train_df.sample(n=queries_per_file).reset_index(drop=True)

        # Reassign IDs starting from the current count
        sampled_df['id'] = np.arange(current_id, current_id + queries_per_file)
        current_id += queries_per_file

        # Convert the DataFrame to a PyArrow table with the specified schema
        sampled_table = pa.Table.from_pandas(sampled_df, schema=schema)

        # Write the sampled data to the output file
        writer.write_table(sampled_table)

        # Clean up to free memory
        del train_table, train_df, sampled_df, sampled_table
        gc.collect()

        logging.info(f"Finished processing file: {file_path}")

logging.info(f"{output_file} has been created with {queries_per_file} randomly sampled embeddings per file.")
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ weaviate = [ "weaviate-client" ]
elastic = [ "elasticsearch" ]
pgvector = [ "psycopg", "psycopg-binary", "pgvector" ]
pgvectorscale = [ "psycopg", "psycopg-binary", "pgvector" ]
pgdiskann = [ "psycopg", "psycopg-binary", "pgvector" ]
pgvecto_rs = [ "pgvecto_rs[psycopg3]>=0.2.2" ]
redis = [ "redis" ]
memorydb = [ "memorydb" ]
Expand Down
Loading