Merged
37 commits
dd5b162
fix: Unable to run vebbench and cli
XuanYang-cn Jan 10, 2025
0095bd7
enhance: Unify optimize and remove ready_to_load
XuanYang-cn Jan 13, 2025
d9fc5e1
add mongodb client
zhuwenxing Jan 14, 2025
811564a
add mongodb client in readme
zhuwenxing Jan 14, 2025
4f21fcf
add some risk warnings for custom dataset
alwayslove2013 Jan 19, 2025
491ef6b
Bump grpcio from 1.53.0 to 1.53.2 in /install
dependabot[bot] Jan 20, 2025
5eeab7e
add mongodb config
zhuwenxing Jan 14, 2025
111048d
Opensearch interal configuration parameters (#463)
Xavierantony1982 Jan 31, 2025
0756516
ui control num of concurrencies
Caroline-an777 Feb 10, 2025
62454b3
Update README.md
xiaofan-luan Feb 12, 2025
6832120
environs version should <14.1.0
alwayslove2013 Feb 13, 2025
220038e
Support GPU_BRUTE_FORCE index for Milvus (#476)
Rachit-Chaudhary11 Feb 24, 2025
7bda989
Add table quantization type
lucagiac81 Nov 5, 2024
7f50104
Support MariaDB database (#375)
HugoWenTD Mar 11, 2025
b8221d1
Add TiDB backend (#484)
breezewish Mar 13, 2025
dba738b
CLI fix for GPU index (#485)
Rachit-Chaudhary11 Mar 14, 2025
4cbfef7
remove duplicated code
yuyuankang Mar 25, 2025
a39fe83
feat: initial commit
MansorY23 Apr 8, 2025
1446c6e
Add vespa integration
nuvotex-tk Apr 8, 2025
1ab2627
remove redundant empty_field config check for qdrant and tidb
alwayslove2013 Apr 14, 2025
05203c0
reformat all
alwayslove2013 Apr 14, 2025
1a9aa48
fix cli crush
alwayslove2013 Apr 16, 2025
90879f7
downgrade streamlit version
pauvez Apr 17, 2025
1a1ba0d
add more milvus index types: hnsw sq/pq/prq; ivf rabitq
alwayslove2013 Apr 18, 2025
e42845f
add more milvus index types: ivf_pq
alwayslove2013 Apr 23, 2025
7f83936
Add HNSW support for Clickhouse client (#500)
MansorY23 Apr 24, 2025
b7bad93
fix bugs when use custom_dataset without groundtruth file
alwayslove2013 Apr 30, 2025
024455f
fix: prevent the frontend from crashing on invalid indexes in results
s-h-a-d-o-w May 3, 2025
4ef378b
fix ruff warnings
s-h-a-d-o-w May 6, 2025
b1e5cb7
Fix formatting
s-h-a-d-o-w May 6, 2025
617e57e
Add lancedb
s-h-a-d-o-w Apr 26, 2025
029666d
Add --task-label option for cli (#517)
LoveYou3000 May 7, 2025
31b8cbd
Add qdrant cli
s-h-a-d-o-w May 6, 2025
7d8464c
Update README.md
yuyuankang May 12, 2025
975ba84
Fixing Bugs in Benchmarking ClickHouse with vectordbbench (#523)
yuyuankang May 13, 2025
556b703
Add --concurrency-timeout option to avoid long time waiting (#521)
LoveYou3000 May 14, 2025
b893bde
Merge branch 'main-yb' into sync-upstream-main
shaharuk-yb May 15, 2025
4 changes: 3 additions & 1 deletion .gitignore
@@ -8,5 +8,7 @@ __MACOSX
.DS_Store
build/
venv/
.venv/
.idea/
results/
results/
logs/
71 changes: 66 additions & 5 deletions README.md
@@ -13,6 +13,8 @@ Closely mimicking real-world production environments, we've set up diverse testi

Prepare to delve into the world of VectorDBBench, and let it guide you in uncovering your perfect vector database match.

VectorDBBench is sponsored by Zilliz, the leading open-source vector database company behind Milvus. Choose smarter with VectorDBBench; start your free test on [Zilliz Cloud](https://zilliz.com/) today!

**Leaderboard:** https://zilliz.com/benchmark
## Quick Start
### Prerequisites
@@ -53,6 +55,8 @@ All the database client supported
| awsopensearch | `pip install vectordb-bench[opensearch]` |
| aliyun_opensearch | `pip install vectordb-bench[aliyun_opensearch]` |
| mongodb | `pip install vectordb-bench[mongodb]` |
| tidb | `pip install vectordb-bench[tidb]` |
| vespa | `pip install vectordb-bench[vespa]` |

### Run

@@ -110,6 +114,10 @@ Options:
--num-concurrency TEXT Comma-separated list of concurrency values
to test during concurrent search [default:
1,10,20]
--concurrency-timeout INTEGER Timeout (in seconds) to wait for a
concurrency slot before failing. Set to a
negative value to wait indefinitely.
[default: 3600]
--user-name TEXT Db username [required]
--password TEXT Db password [required]
--host TEXT Db host [required]
@@ -129,7 +137,11 @@ Options:
--ef-construction INTEGER hnsw ef-construction
--ef-search INTEGER hnsw ef-search
--quantization-type [none|bit|halfvec]
quantization type for vectors
quantization type for vectors (in index)
--table-quantization-type [none|bit|halfvec]
quantization type for vectors (in table). If
equal to bit, the parameter
quantization_type will be set to bit too.
--custom-case-name TEXT Custom case name i.e. PerformanceCase1536D50K
--custom-case-description TEXT Custom name description
--custom-case-load-timeout INTEGER
@@ -153,6 +165,48 @@ Options:
with-gt]
--help Show this message and exit.
```
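The `--concurrency-timeout` semantics described above (a negative value means wait indefinitely for a concurrency slot) map naturally onto a semaphore acquire. A minimal sketch of that interpretation — not the project's actual implementation:

```python
import threading

def acquire_slot(sem: threading.Semaphore, concurrency_timeout: float) -> bool:
    """Try to take a concurrency slot.

    A negative concurrency_timeout means wait indefinitely, mirroring the
    CLI option's documented behavior; otherwise give up after the timeout.
    """
    timeout = None if concurrency_timeout < 0 else concurrency_timeout
    return sem.acquire(timeout=timeout)

sem = threading.Semaphore(1)
print(acquire_slot(sem, -1))    # slot free, taken immediately → True
print(acquire_slot(sem, 0.01))  # slot busy, short timeout expires → False
```

With the default of 3600, a worker that cannot get a slot within an hour fails instead of hanging forever.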

### Run awsopensearch from command line

```shell
vectordbbench awsopensearch --db-label awsopensearch \
--m 16 --ef-construction 256 \
--host search-vector-db-prod-h4f6m4of6x7yp2rz7gdmots7w4.us-west-2.es.amazonaws.com --port 443 \
--user vector --password '<password>' \
--case-type Performance1536D5M --num-insert-workers 10 \
--skip-load --num-concurrency 75
```

To list the options for awsopensearch, execute `vectordbbench awsopensearch --help`.

```text
$ vectordbbench awsopensearch --help
Usage: vectordbbench awsopensearch [OPTIONS]

Options:
# Sharding and Replication
--number-of-shards INTEGER Number of primary shards for the index
--number-of-replicas INTEGER Number of replica copies for each primary
shard
# Indexing Performance
--index-thread-qty INTEGER Thread count for native engine indexing
--index-thread-qty-during-force-merge INTEGER
Thread count during force merge operations
--number-of-indexing-clients INTEGER
Number of concurrent indexing clients
# Index Management
--number-of-segments INTEGER Target number of segments after merging
--refresh-interval TEXT How often to make new data available for
search
--force-merge-enabled BOOLEAN Whether to perform force merge operation
--flush-threshold-size TEXT Size threshold for flushing the transaction
log
# Memory Management
--cb-threshold TEXT k-NN Memory circuit breaker threshold

--help Show this message and exit.
```

#### Using a configuration file

The `vectordbbench` command can optionally read some or all of its options from a YAML-formatted configuration file.
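As a hypothetical illustration only — the key names below mirror the CLI flags shown above, but the real file's schema and key casing may differ, so check the project documentation:

```yaml
# Hypothetical configuration sketch; keys mirror the CLI flags above.
db_label: my-benchmark-run
case_type: Performance1536D50K
num_concurrency: "1,10,20"
concurrency_timeout: 3600
drop_old: true
```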
@@ -218,13 +272,13 @@ pip install -e '.[pinecone]'
```
### Run test server
```
$ python -m vectordb_bench
python -m vectordb_bench
```

OR:

```shell
$ init_bench
init_bench
```

OR:
@@ -241,13 +295,13 @@ After reopening the repository in the container, run `python -m vectordb_bench` in the

### Check coding styles
```shell
$ make lint
make lint
```

To fix the coding styles automatically

```shell
$ make format
make format
```

## How does it work?
@@ -319,6 +373,13 @@ We have strict requirements for the dataset format; please follow them.
- `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format.
- Vectors data files: The file must be named `train.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
- Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
- We recommend limiting the number of test query vectors (e.g., to 1,000).
When running concurrent query tests, VectorDBBench spawns a large number of processes.
To minimize communication overhead during testing, each process is given a complete copy
of the test queries so it can run independently. As the number of concurrent processes
grows, so does the number of copied query vectors, which can place substantial pressure
on memory.
- Ground truth file: The file must be named `neighbors.parquet` and should have two columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`.

- `Train File Count` - If the vector file is too large, you can consider splitting it into multiple files. The naming format for the split files should be `train-[index]-of-[file_count].parquet`. For example, `train-01-of-10.parquet` represents the second file (0-indexed) among 10 split files.
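The memory-pressure point above can be made concrete with a quick back-of-envelope estimate (a sketch: 4 bytes per float32 component, ignoring Python object overhead):

```python
def query_copy_bytes(num_processes: int, num_queries: int, dim: int) -> int:
    """Rough size of the duplicated float32 query vectors across worker processes."""
    bytes_per_float = 4  # float32
    return num_processes * num_queries * dim * bytes_per_float

# 20 concurrent processes, each holding 1,000 queries of 1,536 dimensions:
gb = query_copy_bytes(20, 1_000, 1536) / 1024**3
print(f"{gb:.2f} GiB")  # → 0.11 GiB
```

With 10,000 queries instead of 1,000, the same run would need roughly ten times as much, which is why capping the test set size matters.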
3 changes: 2 additions & 1 deletion install.py
@@ -1,7 +1,8 @@
import os
import argparse
import os
import subprocess


def docker_tag_base():
return 'vdbbench'

4 changes: 3 additions & 1 deletion install/requirements_py3.11.txt
@@ -1,4 +1,4 @@
grpcio==1.53.0
grpcio==1.53.2
grpcio-tools==1.53.0
qdrant-client
pinecone-client
@@ -22,3 +22,5 @@ environs
pydantic<v2
scikit-learn
pymilvus
clickhouse_connect
pyvespa
12 changes: 11 additions & 1 deletion pyproject.toml
@@ -27,7 +27,7 @@ dependencies = [
"click",
"pytz",
"streamlit-autorefresh",
"streamlit!=1.34.0",
"streamlit<1.44,!=1.34.0", # There is a breaking change in 1.44 related to get_page https://discuss.streamlit.io/t/from-streamlit-source-util-import-get-pages-gone-in-v-1-44-0-need-urgent-help/98399
"streamlit_extras",
"tqdm",
"s3fs",
@@ -68,6 +68,11 @@ all = [
"memorydb",
"alibabacloud_ha3engine_vector",
"alibabacloud_searchengine20211025",
"mariadb",
"PyMySQL",
"clickhouse-connect",
"pyvespa",
"lancedb",
]

qdrant = [ "qdrant-client" ]
@@ -86,6 +91,11 @@ chromadb = [ "chromadb" ]
opensearch = [ "opensearch-py" ]
aliyun_opensearch = [ "alibabacloud_ha3engine_vector", "alibabacloud_searchengine20211025"]
mongodb = [ "pymongo" ]
mariadb = [ "mariadb" ]
tidb = [ "PyMySQL" ]
clickhouse = [ "clickhouse-connect" ]
vespa = [ "pyvespa" ]
lancedb = [ "lancedb" ]

[project.urls]
"repository" = "https://github.com/zilliztech/VectorDBBench"
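The streamlit pin above (`<1.44,!=1.34.0`) excludes one broken release and everything from the 1.44 breaking change onward. A small sketch of checking a version against that constraint without external packages — naive tuple comparison, assuming plain numeric versions (a real check should use the `packaging` library's specifier support):

```python
def allowed(version: str) -> bool:
    """Check streamlit's '<1.44,!=1.34.0' pin for simple numeric versions."""
    parts = tuple(int(p) for p in version.split("."))
    return parts != (1, 34, 0) and parts < (1, 44)

print(allowed("1.43.2"))  # → True  (below 1.44, not the excluded release)
print(allowed("1.34.0"))  # → False (explicitly excluded)
print(allowed("1.44.0"))  # → False (breaking get_pages change)
```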
4 changes: 3 additions & 1 deletion vectordb_bench/__init__.py
@@ -6,7 +6,7 @@
from . import log_util

env = environs.Env()
env.read_env(".env", False)
env.read_env(path=".env", recurse=False)


class config:
@@ -52,6 +52,8 @@ class config:

CONCURRENCY_DURATION = 30

CONCURRENCY_TIMEOUT = 3600

RESULTS_LOCAL_DIR = env.path(
"RESULTS_LOCAL_DIR",
pathlib.Path(__file__).parent.joinpath("results"),
86 changes: 83 additions & 3 deletions vectordb_bench/backend/clients/__init__.py
@@ -38,12 +38,17 @@ class DB(Enum):
Chroma = "Chroma"
AWSOpenSearch = "OpenSearch"
AliyunElasticsearch = "AliyunElasticsearch"
MariaDB = "MariaDB"
Test = "test"
AliyunOpenSearch = "AliyunOpenSearch"
MongoDB = "MongoDB"
TiDB = "TiDB"
Clickhouse = "Clickhouse"
Vespa = "Vespa"
LanceDB = "LanceDB"

@property
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901, PLR0915
"""Import while in use"""
if self == DB.Milvus:
from .milvus.milvus import Milvus
@@ -115,6 +120,11 @@ def init_cls(self) -> type[VectorDB]:  # noqa: PLR0911, PLR0912, C901

return AWSOpenSearch

if self == DB.Clickhouse:
from .clickhouse.clickhouse import Clickhouse

return Clickhouse

if self == DB.AlloyDB:
from .alloydb.alloydb import AlloyDB

@@ -135,16 +145,36 @@

return MongoDB

if self == DB.MariaDB:
from .mariadb.mariadb import MariaDB

return MariaDB

if self == DB.TiDB:
from .tidb.tidb import TiDB

return TiDB

if self == DB.Test:
from .test.test import Test

return Test

if self == DB.Vespa:
from .vespa.vespa import Vespa

return Vespa

if self == DB.LanceDB:
from .lancedb.lancedb import LanceDB

return LanceDB

msg = f"Unknown DB: {self.name}"
raise ValueError(msg)

@property
def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912, C901
def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912, C901, PLR0915
"""Import while in use"""
if self == DB.Milvus:
from .milvus.config import MilvusConfig
@@ -216,6 +246,11 @@ def config_cls(self) -> type[DBConfig]:  # noqa: PLR0911, PLR0912, C901

return AWSOpenSearchConfig

if self == DB.Clickhouse:
from .clickhouse.config import ClickhouseConfig

return ClickhouseConfig

if self == DB.AlloyDB:
from .alloydb.config import AlloyDBConfig

@@ -236,15 +271,35 @@ def config_cls(self) -> type[DBConfig]:  # noqa: PLR0911, PLR0912, C901

return MongoDBConfig

if self == DB.MariaDB:
from .mariadb.config import MariaDBConfig

return MariaDBConfig

if self == DB.TiDB:
from .tidb.config import TiDBConfig

return TiDBConfig

if self == DB.Test:
from .test.config import TestConfig

return TestConfig

if self == DB.Vespa:
from .vespa.config import VespaConfig

return VespaConfig

if self == DB.LanceDB:
from .lancedb.config import LanceDBConfig

return LanceDBConfig

msg = f"Unknown DB: {self.name}"
raise ValueError(msg)

def case_config_cls( # noqa: PLR0911
def case_config_cls( # noqa: C901, PLR0911, PLR0912
self,
index_type: IndexType | None = None,
) -> type[DBCaseConfig]:
@@ -288,6 +343,11 @@ def case_config_cls( # noqa: PLR0911

return AWSOpenSearchIndexConfig

if self == DB.Clickhouse:
from .clickhouse.config import ClickhouseHNSWConfig

return ClickhouseHNSWConfig

if self == DB.PgVectorScale:
from .pgvectorscale.config import _pgvectorscale_case_config

@@ -318,6 +378,26 @@ def case_config_cls( # noqa: PLR0911

return MongoDBIndexConfig

if self == DB.MariaDB:
from .mariadb.config import _mariadb_case_config

return _mariadb_case_config.get(index_type)

if self == DB.TiDB:
from .tidb.config import TiDBIndexConfig

return TiDBIndexConfig

if self == DB.Vespa:
from .vespa.config import VespaHNSWConfig

return VespaHNSWConfig

if self == DB.LanceDB:
from .lancedb.config import _lancedb_case_config

return _lancedb_case_config.get(index_type)

# DB.Pinecone, DB.Chroma, DB.Redis
return EmptyDBCaseConfig
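The pattern in this file — an enum whose properties import the client module only when that backend is selected ("import while in use") — keeps optional dependencies optional: installing `vectordb-bench[tidb]` alone never triggers, say, a `pyvespa` import. A minimal stand-in sketch using stdlib modules (the `Backend` name and members are illustrative, not the project's):

```python
import importlib
from enum import Enum

class Backend(Enum):
    # Stand-ins for real client backends; each value is a module name
    # that is imported only when the backend is actually used.
    JSON = "json"
    CSV = "csv"

    @property
    def client_module(self):
        """Import while in use, mirroring DB.init_cls / config_cls above."""
        return importlib.import_module(self.value)

print(Backend.JSON.client_module.dumps({"k": 1}))  # → {"k": 1}
```

Each new backend in this PR (MariaDB, TiDB, Clickhouse, Vespa, LanceDB) adds one enum member plus a guarded import in each property, which is also why the `noqa` complexity suppressions had to grow.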
