Commit

Merge branch 'main' into memorymind

Baswanth Vegunta committed Jul 26, 2024
2 parents 11148e2 + c45876c commit ffb5ed5
Showing 46 changed files with 1,030 additions and 220 deletions.
45 changes: 32 additions & 13 deletions README.md
@@ -27,19 +27,20 @@ pip install vectordb-bench[pinecone]
```
All supported database clients:

-|Optional database client|install command|
-|---------------|---------------|
-|pymilvus(*default*)|`pip install vectordb-bench`|
-|all|`pip install vectordb-bench[all]`|
-|qdrant|`pip install vectordb-bench[qdrant]`|
-|pinecone|`pip install vectordb-bench[pinecone]`|
-|weaviate|`pip install vectordb-bench[weaviate]`|
-|elastic|`pip install vectordb-bench[elastic]`|
-|pgvector|`pip install vectordb-bench[pgvector]`|
-|pgvecto.rs|`pip install vectordb-bench[pgvecto_rs]`|
-|redis|`pip install vectordb-bench[redis]`|
-|memorydb| `pip install vectordb-bench[memorydb]`|
-|chromadb|`pip install vectordb-bench[chromadb]`|
+| Optional database client | install command |
+|--------------------------|---------------------------------------------|
+| pymilvus(*default*) | `pip install vectordb-bench` |
+| all | `pip install vectordb-bench[all]` |
+| qdrant | `pip install vectordb-bench[qdrant]` |
+| pinecone | `pip install vectordb-bench[pinecone]` |
+| weaviate | `pip install vectordb-bench[weaviate]` |
+| elastic | `pip install vectordb-bench[elastic]` |
+| pgvector | `pip install vectordb-bench[pgvector]` |
+| pgvecto.rs | `pip install vectordb-bench[pgvecto_rs]` |
+| redis | `pip install vectordb-bench[redis]` |
+| memorydb | `pip install vectordb-bench[memorydb]` |
+| chromadb | `pip install vectordb-bench[chromadb]` |
+| awsopensearch | `pip install vectordb-bench[awsopensearch]` |

### Run

@@ -282,6 +283,24 @@ Case No. | Case Type | Dataset Size | Filtering Rate | Results |

Each case provides an in-depth examination of a vector database's abilities, giving you a comprehensive view of its performance.

#### Custom Dataset for Performance case

On the `/custom` page, users can define their own performance cases backed by local datasets. After saving, the new case can be selected on the `/run_test` page to run the test.

![image](fig/custom_dataset.png)
![image](fig/custom_case_run_test.png)

The dataset format must follow these strict requirements:
- `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format.
  - Vector data files: The file must be named `train.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
  - Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
  - Ground truth file: The file must be named `neighbors.parquet` and should have two columns: `id` corresponding to the query vectors and `neighbors_id` as an array of `int`.

- `Train File Count` - If the vector file is too large, you can consider splitting it into multiple files. The naming format for the split files should be `train-[index]-of-[file_count].parquet`. For example, `train-01-of-10.parquet` represents the second file (0-indexed) among 10 split files.

- `Use Shuffled Data` - If you check this option, the vector data files need to be modified. VectorDBBench will load the data labeled with `shuffle`. For example, use `shuffle_train.parquet` instead of `train.parquet` and `shuffle_train-04-of-10.parquet` instead of `train-04-of-10.parquet`. The `id` column in the shuffled data can be in any order.
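To sanity-check a dataset folder against the naming rules above, a small helper can enumerate the file names VectorDBBench will look for. This is an illustrative sketch, not part of VectorDBBench: the function name `expected_files` is invented, and the two-digit zero-padding of the split index is inferred from the `train-01-of-10.parquet` and `shuffle_train-04-of-10.parquet` examples above.

```python
def expected_files(file_count: int = 1, use_shuffled: bool = False) -> list[str]:
    """Return the Parquet file names implied by the naming rules above."""
    # shuffled data swaps the "train" prefix for "shuffle_train"
    prefix = "shuffle_train" if use_shuffled else "train"
    if file_count == 1:
        vector_files = [f"{prefix}.parquet"]
    else:
        # split files are 0-indexed: train-00-of-10.parquet ... train-09-of-10.parquet
        vector_files = [
            f"{prefix}-{i:02d}-of-{file_count}.parquet" for i in range(file_count)
        ]
    # query vectors and ground truth keep fixed names regardless of splitting
    return vector_files + ["test.parquet", "neighbors.parquet"]
```

For example, `expected_files(10, use_shuffled=True)` lists `shuffle_train-00-of-10.parquet` through `shuffle_train-09-of-10.parquet` plus `test.parquet` and `neighbors.parquet`, which you can compare against the actual folder contents before running a test.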


## Goals
Our goals of this benchmark are:
### Reproducibility & Usability
Binary file added fig/custom_case_run_test.png
Binary file added fig/custom_dataset.png
4 changes: 4 additions & 0 deletions pyproject.toml
@@ -62,6 +62,8 @@ all = [
     "psycopg2",
     "psycopg",
     "psycopg-binary",
+    "opensearch-dsl==2.1.0",
+    "opensearch-py==2.6.0",
 ]

qdrant = [ "qdrant-client" ]
@@ -73,6 +75,7 @@ pgvecto_rs = [ "psycopg2" ]
 redis = [ "redis" ]
 memorydb = [ "memorydb" ]
 chromadb = [ "chromadb" ]
+awsopensearch = [ "awsopensearch" ]
 zilliz_cloud = []

[project.urls]
@@ -81,4 +84,5 @@ zilliz_cloud = []
[project.scripts]
init_bench = "vectordb_bench.__main__:main"
vectordbbench = "vectordb_bench.cli.vectordbbench:cli"

[tool.setuptools_scm]
1 change: 1 addition & 0 deletions vectordb_bench/__init__.py
@@ -35,6 +35,7 @@ class config:


     K_DEFAULT = 100 # default return top k nearest neighbors during search
+    CUSTOM_CONFIG_DIR = pathlib.Path(__file__).parent.joinpath("custom/custom_case.json")

     CAPACITY_TIMEOUT_IN_SECONDS = 24 * 3600 # 24h
     LOAD_TIMEOUT_DEFAULT = 2.5 * 3600 # 2.5h
2 changes: 1 addition & 1 deletion vectordb_bench/backend/assembler.py
@@ -14,7 +14,7 @@ class Assembler:
     def assemble(cls, run_id , task: TaskConfig, source: DatasetSource) -> CaseRunner:
         c_cls = task.case_config.case_id.case_cls

-        c = c_cls()
+        c = c_cls(task.case_config.custom_case)
         if type(task.db_case_config) != EmptyDBCaseConfig:
             task.db_case_config.metric_type = c.dataset.data.metric_type

82 changes: 64 additions & 18 deletions vectordb_bench/backend/cases.py
@@ -4,9 +4,13 @@
 from typing import Type

 from vectordb_bench import config
+from vectordb_bench.backend.clients.api import MetricType
 from vectordb_bench.base import BaseModel
+from vectordb_bench.frontend.components.custom.getCustomConfig import (
+    CustomDatasetConfig,
+)

-from .dataset import Dataset, DatasetManager
+from .dataset import CustomDataset, Dataset, DatasetManager


 log = logging.getLogger(__name__)
@@ -44,25 +48,24 @@ class CaseType(Enum):
     Performance1536D50K = 50

     Custom = 100
+    PerformanceCustomDataset = 101

-    @property
-    def case_cls(self) -> Type["Case"]:
-        if self not in type2case:
-            raise NotImplementedError(f"Case {self} has not implemented. You can add it manually to vectordb_bench.backend.cases.type2case or define a custom_configs['custom_cls']")
-        return type2case[self]
+    def case_cls(self, custom_configs: dict | None = None) -> Type["Case"]:
+        if custom_configs is None:
+            return type2case.get(self)()
+        else:
+            return type2case.get(self)(**custom_configs)

-    @property
-    def case_name(self) -> str:
-        c = self.case_cls
+    def case_name(self, custom_configs: dict | None = None) -> str:
+        c = self.case_cls(custom_configs)
         if c is not None:
-            return c().name
+            return c.name
         raise ValueError("Case unsupported")

-    @property
-    def case_description(self) -> str:
-        c = self.case_cls
+    def case_description(self, custom_configs: dict | None = None) -> str:
+        c = self.case_cls(custom_configs)
         if c is not None:
-            return c().description
+            return c.description
         raise ValueError("Case unsupported")


@@ -289,26 +292,69 @@ class Performance1536D50K(PerformanceCase):
     optimize_timeout: float | int | None = 15 * 60


+def metric_type_map(s: str) -> MetricType:
+    if s.lower() == "cosine":
+        return MetricType.COSINE
+    if s.lower() == "l2" or s.lower() == "euclidean":
+        return MetricType.L2
+    if s.lower() == "ip":
+        return MetricType.IP
+    err_msg = f"Not support metric_type: {s}"
+    log.error(err_msg)
+    raise RuntimeError(err_msg)
+
+
+class PerformanceCustomDataset(PerformanceCase):
+    case_id: CaseType = CaseType.PerformanceCustomDataset
+    name: str = "Performance With Custom Dataset"
+    description: str = ""
+    dataset: DatasetManager
+
+    def __init__(
+        self,
+        name,
+        description,
+        load_timeout,
+        optimize_timeout,
+        dataset_config,
+        **kwargs,
+    ):
+        dataset_config = CustomDatasetConfig(**dataset_config)
+        dataset = CustomDataset(
+            name=dataset_config.name,
+            size=dataset_config.size,
+            dim=dataset_config.dim,
+            metric_type=metric_type_map(dataset_config.metric_type),
+            use_shuffled=dataset_config.use_shuffled,
+            with_gt=dataset_config.with_gt,
+            dir=dataset_config.dir,
+            file_num=dataset_config.file_count,
+        )
+        super().__init__(
+            name=name,
+            description=description,
+            load_timeout=load_timeout,
+            optimize_timeout=optimize_timeout,
+            dataset=DatasetManager(data=dataset),
+        )


 type2case = {
     CaseType.CapacityDim960: CapacityDim960,
     CaseType.CapacityDim128: CapacityDim128,

     CaseType.Performance768D100M: Performance768D100M,
     CaseType.Performance768D10M: Performance768D10M,
     CaseType.Performance768D1M: Performance768D1M,

     CaseType.Performance768D10M1P: Performance768D10M1P,
     CaseType.Performance768D1M1P: Performance768D1M1P,
     CaseType.Performance768D10M99P: Performance768D10M99P,
     CaseType.Performance768D1M99P: Performance768D1M99P,

     CaseType.Performance1536D500K: Performance1536D500K,
     CaseType.Performance1536D5M: Performance1536D5M,

     CaseType.Performance1536D500K1P: Performance1536D500K1P,
     CaseType.Performance1536D5M1P: Performance1536D5M1P,

     CaseType.Performance1536D500K99P: Performance1536D500K99P,
     CaseType.Performance1536D5M99P: Performance1536D5M99P,
     CaseType.Performance1536D50K: Performance1536D50K,
+    CaseType.PerformanceCustomDataset: PerformanceCustomDataset,
 }
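To make the new case concrete: `PerformanceCustomDataset.__init__` above unpacks a `dataset_config` dict into a `CustomDatasetConfig`. A hypothetical dict of that shape might look as follows. The field names are taken from the constructor in this diff; every value is invented for illustration and is not a real dataset.

```python
# Hypothetical example only -- field names come from the constructor above,
# all values are invented.
dataset_config = {
    "name": "my_openai_500k",       # label for the dataset in results
    "dir": "/data/my_openai_500k",  # folder holding train/test/neighbors.parquet
    "size": 500_000,                # number of train vectors
    "dim": 1536,                    # vector dimension
    "metric_type": "cosine",        # string mapped via metric_type_map above
    "file_count": 1,                # how many train-*.parquet split files
    "use_shuffled": False,          # expect shuffle_train*.parquet when True
    "with_gt": True,                # neighbors.parquet ground truth is present
}
```

A case could then, in principle, be built with `PerformanceCustomDataset(name=..., description=..., load_timeout=..., optimize_timeout=..., dataset_config=dataset_config)`.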
13 changes: 13 additions & 0 deletions vectordb_bench/backend/clients/__init__.py
@@ -33,6 +33,7 @@ class DB(Enum):
     Redis = "Redis"
     MemoryDB = "MemoryDB"
     Chroma = "Chroma"
+    AWSOpenSearch = "OpenSearch"
     Test = "test"


@@ -83,6 +84,10 @@ def init_cls(self) -> Type[VectorDB]:
             from .chroma.chroma import ChromaClient
             return ChromaClient

+        if self == DB.AWSOpenSearch:
+            from .aws_opensearch.aws_opensearch import AWSOpenSearch
+            return AWSOpenSearch
+
     @property
     def config_cls(self) -> Type[DBConfig]:
         """Import while in use"""
@@ -130,6 +135,10 @@ def config_cls(self) -> Type[DBConfig]:
             from .chroma.config import ChromaConfig
             return ChromaConfig

+        if self == DB.AWSOpenSearch:
+            from .aws_opensearch.config import AWSOpenSearchConfig
+            return AWSOpenSearchConfig
+
     def case_config_cls(self, index_type: IndexType | None = None) -> Type[DBCaseConfig]:
         if self == DB.Milvus:
             from .milvus.config import _milvus_case_config
@@ -159,6 +168,10 @@ def case_config_cls(self, index_type: IndexType | None = None) -> Type[DBCaseCon
             from .pgvecto_rs.config import _pgvecto_rs_case_config
             return _pgvecto_rs_case_config.get(index_type)

+        if self == DB.AWSOpenSearch:
+            from .aws_opensearch.config import AWSOpenSearchIndexConfig
+            return AWSOpenSearchIndexConfig
+
         # DB.Pinecone, DB.Chroma, DB.Redis
         return EmptyDBCaseConfig

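The `init_cls`/`config_cls` additions above follow the file's existing pattern of deferring imports until a backend is actually selected, so optional client packages (such as `opensearch-py`) are only required when used. A minimal standalone sketch of that pattern, using stdlib modules as stand-ins for the real client packages (`Backend` and `client_module` are invented names, not VectorDBBench API):

```python
from enum import Enum
from importlib import import_module


class Backend(Enum):
    """Toy stand-in for the DB enum: each member names a module to load lazily."""
    Json = "json"
    Csv = "csv"

    @property
    def client_module(self):
        # Deferred import: nothing is loaded until the property is accessed,
        # mirroring how DB.init_cls imports a client only when that DB is chosen.
        return import_module(self.value)


# Only the selected backend's module is imported.
payload = Backend.Json.client_module.dumps({"ok": True})
```

The design choice matters for a benchmark tool with many optional extras: importing every client eagerly at startup would make the whole package fail to load whenever a single optional dependency is missing.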