From 2470dbecb59c4772c8ff32291d29ca21a7d6d6d9 Mon Sep 17 00:00:00 2001
From: "min.tian" <min.tian.cn@gmail.com>
Date: Fri, 23 Feb 2024 03:45:47 +0000
Subject: [PATCH 1/3] update hippo client

---
 .devcontainer/devcontainer.json               |  50 +++++
 .devcontainer/postCreateCommand.sh            |  34 +++
 .editorconfig                                 |  18 ++
 .gitignore                                    |  21 ++
 .vscode/launch.json                           |  28 +++
 .vscode/settings.json                         |  10 +
 .vscode/tasks.json                            |  29 +++
 ci-transwarp/Dockerfile                       |  33 +++
 ci-transwarp/Note.md                          |  77 +++++++
 ci-transwarp/pip.conf                         |   5 +
 vectordb_bench/backend/clients/__init__.py    |  13 ++
 .../backend/clients/hippo/config.py           |  67 ++++++
 vectordb_bench/backend/clients/hippo/hippo.py | 212 ++++++++++++++++++
 vectordb_bench/backend/clients/hippo/test.py  |  93 ++++++++
 vectordb_bench/backend/data_source.py         |   3 +-
 .../backend/runner/serial_runner.py           |  11 +-
 vectordb_bench/backend/task_runner.py         |   8 +-
 .../components/run_test/dbSelector.py         |   2 +-
 .../frontend/const/dbCaseConfigs.py           | 183 ++++++++++++++-
 vectordb_bench/frontend/const/styles.py       |   2 +
 vectordb_bench/metric.py                      |   5 +
 vectordb_bench/models.py                      |  14 +-
 22 files changed, 900 insertions(+), 18 deletions(-)
 create mode 100644 .devcontainer/devcontainer.json
 create mode 100644 .devcontainer/postCreateCommand.sh
 create mode 100644 .editorconfig
 create mode 100644 .vscode/launch.json
 create mode 100644 .vscode/settings.json
 create mode 100644 .vscode/tasks.json
 create mode 100644 ci-transwarp/Dockerfile
 create mode 100644 ci-transwarp/Note.md
 create mode 100644 ci-transwarp/pip.conf
 create mode 100644 vectordb_bench/backend/clients/hippo/config.py
 create mode 100644 vectordb_bench/backend/clients/hippo/hippo.py
 create mode 100644 vectordb_bench/backend/clients/hippo/test.py

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 000000000..b3439a14e
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,50 @@
+// For format details, see https://aka.ms/devcontainer.json. For config options, see the
+// README at: https://github.com/devcontainers/templates/tree/main/src/cpp
+{
+    "name": "vectordb bench dev",
+    "image": "172.16.1.99/hippo/vectordb_bench/builder:latest",
+    "runArgs": [
+        "--privileged",
+        "--cap-add=SYS_PTRACE",
+        "--security-opt",
+        "seccomp=unconfined"
+    ],
+    "mounts": [
+        "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind"
+    ],
+    "workspaceMount": "source=${localWorkspaceFolder},target=/opt/transwarp/vectordb_bench,type=bind,consistency=cached",
+    "workspaceFolder": "/opt/transwarp/vectordb_bench",
+    "customizations": {
+        "vscode": {
+            // Set *default* container specific settings.json values on container create.
+            "settings": {
+                "terminal.integrated.shell.linux": "/bin/bash",
+                "C_Cpp.default.cppStandard": "c++20",
+                "C_Cpp.default.cStandard": "c17",
+                "C_Cpp.default.browse.databaseFilename": "${workspaceFolder}/.vscode/.browse.c_cpp.db",
+            },
+            // Add the IDs of extensions you want installed when the container is created.
+            "extensions": [
+                "foxundermoon.shell-format",
+                "redhat.vscode-yaml",
+                "ms-azuretools.vscode-docker",
+                "EditorConfig.EditorConfig",
+                "codezombiech.gitignore",
+                "yzhang.markdown-all-in-one",
+                "SonarSource.sonarlint-vscode",
+                "GitHub.copilot",
+                "ms-python.python",
+                "ms-python.debugpy",
+                "VisualStudioExptTeam.vscodeintellicode",
+                "donjayamanne.python-environment-manager",
+                "charliermarsh.ruff"
+            ]
+        }
+    },
+    // Use 'forwardPorts' to make a list of ports inside the container available locally.
+    // "forwardPorts": [],
+    // Use 'postCreateCommand' to run commands after the container is created.
+    "postCreateCommand": "bash ./.devcontainer/postCreateCommand.sh"
+    // Uncomment to connect as the "vscode" user; while commented out, the container's default user is used. More info: https://aka.ms/vscode-remote/containers/non-root.
+    // "remoteUser": "vscode"
+}
\ No newline at end of file
diff --git a/.devcontainer/postCreateCommand.sh b/.devcontainer/postCreateCommand.sh
new file mode 100644
index 000000000..3f8f77fcc
--- /dev/null
+++ b/.devcontainer/postCreateCommand.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+function create_user {
+  USERNAME=dev
+  USER_UID=1000
+  USER_GID=$USER_UID
+
+  # Create the user
+  groupadd --gid $USER_GID $USERNAME
+  useradd --uid $USER_UID --gid $USER_GID -m $USERNAME
+
+  # [Optional] Add sudo support. Omit if you don't need to install software after connecting.
+  apt install -y sudo
+  echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME
+  chmod 0440 /etc/sudoers.d/$USERNAME  
+}
+
+function fix_git {
+  # Mark the bind-mounted workspace as safe so git does not reject it over an ownership mismatch.
+  git config --global --add safe.directory /opt/transwarp/vectordb_bench
+
+  # Remove GitHub-specific proxy overrides. `--unset` exits non-zero when the key
+  # is absent; ignore that so the postCreateCommand does not report a failure.
+  git config --global --unset http.https://github.com.proxy || true
+  git config --global --unset https.https://github.com.proxy || true
+}
+
+function main {
+  # create_user is currently disabled; only the git fix-ups run.
+
+  fix_git
+}
+
+main "$@"
\ No newline at end of file
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 000000000..cc30f74dd
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,18 @@
+# EditorConfig is awesome: https://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+[*]
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = false
+insert_final_newline = false
+
+[Dockerfile*]
+indent_style = space
+indent_size = 4
+
+[*.json]
+indent_style = space
+indent_size = 4
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 004524444..0a316c319 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,24 @@ __MACOSX
 build/
 venv/
 .idea/
+
+# result files
+vectordb_bench/results/**
+
+# vscode files
+.vscode/*
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/settings.json
+!.vscode/*.code-snippets
+!.vscode/c_cpp_properties.json
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+# ruff files
+.ruff_cache
\ No newline at end of file
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 000000000..fb8612a23
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,28 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Streamlit",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "streamlit",
+            "args": [
+                "run",
+                "vectordb_bench/frontend/vdb_benchmark.py",
+                "--logger.level",
+                "info",
+                "--theme.base",
+                "light",
+                "--theme.primaryColor",
+                "#3670F2",
+                "--theme.secondaryBackgroundColor",
+                "#F0F2F6",
+            ],
+            "subProcess": true,
+            "justMyCode": false
+        }
+    ]
+}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 000000000..88f75752b
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,10 @@
+{
+    "[python]": {
+        "editor.formatOnSave": false,
+        // "editor.codeActionsOnSave": {
+        //     "source.fixAll": "always",
+        //     "source.organizeImports": "always"
+        // },
+        "editor.defaultFormatter": "charliermarsh.ruff"
+    }
+}
\ No newline at end of file
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
new file mode 100644
index 000000000..085a47bd1
--- /dev/null
+++ b/.vscode/tasks.json
@@ -0,0 +1,29 @@
+{
+    // See https://go.microsoft.com/fwlink/?LinkId=733558
+    // for the documentation about the tasks.json format
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "label": "build vectordb bench",
+            "type": "shell",
+            "command": "python",
+            "args": [
+                "-m",
+                "pip",
+                "install",
+                "-e",
+                ".[test]"
+            ],
+            "group": {
+                "kind": "build",
+                "isDefault": true
+            }
+        },
+        {
+            "label": "run vectordb bench",
+            "type": "shell",
+            "command": "init_bench",
+            "problemMatcher": []
+        }
+    ]
+}
\ No newline at end of file
diff --git a/ci-transwarp/Dockerfile b/ci-transwarp/Dockerfile
new file mode 100644
index 000000000..6c21a5219
--- /dev/null
+++ b/ci-transwarp/Dockerfile
@@ -0,0 +1,33 @@
+FROM ubuntu:22.04
+
+ARG build_dir=/opt/transwarp/vectordb_bench
+
+RUN \
+    # basics (/etc/timezone holds the zone name as plain text; /etc/localtime is the zoneinfo symlink)
+    PKGS="software-properties-common vim sudo locales git" && \
+    apt-get update && \
+    apt-get install -y ${PKGS} && \
+    locale-gen en_US.UTF-8 && \
+    echo "Asia/Shanghai" > /etc/timezone && \
+    ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
+    # python
+    add-apt-repository -y ppa:deadsnakes/ppa && \
+    apt-get update && \
+    apt-get install -y python3-pip python3.11 python3.11-dev && \
+    apt-get clean && apt-get autoclean
+
+ENV LC_ALL="en_US.UTF-8"
+ENV LANG="en_US.UTF-8"
+
+COPY ci-transwarp/pip.conf /root/.config/pip/pip.conf
+COPY . ${build_dir}
+
+RUN \
+    update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
+    python -m pip install --upgrade pip && \
+    python -m pip install hippo-api==1.2.0rc1 && \
+    cd ${build_dir} && python -m pip install .[test] && cd ${build_dir} && rm -rf ./*
+
+VOLUME [ "${build_dir}" ]
+
+WORKDIR ${build_dir}
diff --git a/ci-transwarp/Note.md b/ci-transwarp/Note.md
new file mode 100644
index 000000000..838d1f783
--- /dev/null
+++ b/ci-transwarp/Note.md
@@ -0,0 +1,77 @@
+CI
+
+---
+
+### 准备数据集
+
+```
+aws s3 ls s3://assets.zilliz.com/benchmark/ --region us-west-2 --recursive --no-sign-request
+
+aws s3 cp s3://assets.zilliz.com/benchmark/cohere_medium_1m cohere_medium_1m  --region us-west-2 --recursive --no-sign-request
+```
+
+### 构建
+
+```shell
+docker build \
+    --network=host \
+    -f ci-transwarp/Dockerfile \
+    -t 172.16.1.99/hippo/vectordb_bench/builder \
+    .
+```
+
+### 运行
+
+```shell
+git clone -b dev "http://gitlab+deploy-token-54:<DEPLOY_TOKEN>@172.16.1.41/distributed-storage/vectordbbench.git"
+
+# docker run这个在上一个clone出来的目录下跑
+docker run \
+    --network=host \
+    -itd \
+    -v $(pwd):/opt/transwarp/vectordb_bench \
+    -v XXXX:/tmp/vectordb_bench/dataset \
+    172.16.1.99/hippo/vectordb_bench/builder bash
+```
+
+XXXX这个目录是数据集的目录，目录结构大概如下（参考tw-node45节点/mnt/disk1/hippo/dataset/vectordb_bench, tar.gz文件忽略）:
+
+```
+[root@tw-node45 vectordb_bench]# tree
+.
+├── cohere
+│   └── cohere_medium_1m
+│       ├── neighbors_head_1p.parquet
+│       ├── neighbors.parquet
+│       ├── neighbors_tail_1p.parquet
+│       ├── shuffle_train.parquet
+│       ├── test.parquet
+│       └── train.parquet
+├── cohere_medium_1m.tar.gz
+└── openai
+    ├── openai_medium_500k
+    │   ├── neighbors_head_1p.parquet
+    │   ├── neighbors.parquet
+    │   ├── neighbors_tail_1p.parquet
+    │   ├── shuffle_train.parquet
+    │   ├── test.parquet
+    │   └── train.parquet
+    ├── openai_small_50k
+    │   ├── neighbors_head_1p.parquet
+    │   ├── neighbors.parquet
+    │   ├── neighbors_tail_1p.parquet
+    │   ├── shuffle_train.parquet
+    │   ├── test.parquet
+    │   └── train.parquet
+    └── openai_small_50k.tar.gz
+
+```
+
+
+容器里执行:
+
+```shell
+cd /opt/transwarp/vectordb_bench
+python -m pip install .
+init_bench
+```
\ No newline at end of file
diff --git a/ci-transwarp/pip.conf b/ci-transwarp/pip.conf
new file mode 100644
index 000000000..ddc2a8931
--- /dev/null
+++ b/ci-transwarp/pip.conf
@@ -0,0 +1,5 @@
+[global]
+index-url = https://mirrors.aliyun.com/pypi/simple
+
+[install]
+trusted-host = mirrors.aliyun.com
diff --git a/vectordb_bench/backend/clients/__init__.py b/vectordb_bench/backend/clients/__init__.py
index 3df11610b..4662ea53c 100644
--- a/vectordb_bench/backend/clients/__init__.py
+++ b/vectordb_bench/backend/clients/__init__.py
@@ -32,6 +32,7 @@ class DB(Enum):
     PgVectoRS = "PgVectoRS"
     Redis = "Redis"
     Chroma = "Chroma"
+    Hippo = "Hippo"
 
 
     @property
@@ -76,6 +77,10 @@ def init_cls(self) -> Type[VectorDB]:
         if self == DB.Chroma:
             from .chroma.chroma import ChromaClient
             return ChromaClient
+        
+        if self == DB.Hippo:
+            from .hippo.hippo import Hippo
+            return Hippo
 
     @property
     def config_cls(self) -> Type[DBConfig]:
@@ -120,6 +125,10 @@ def config_cls(self) -> Type[DBConfig]:
             from .chroma.config import ChromaConfig
             return ChromaConfig
 
+        if self == DB.Hippo:
+            from .hippo.config import HippoConfig
+            return HippoConfig
+
     def case_config_cls(self, index_type: IndexType | None = None) -> Type[DBCaseConfig]:
         if self == DB.Milvus:
             from .milvus.config import _milvus_case_config
@@ -149,6 +158,10 @@ def case_config_cls(self, index_type: IndexType | None = None) -> Type[DBCaseCon
             from .pgvecto_rs.config import _pgvecto_rs_case_config
             return _pgvecto_rs_case_config.get(index_type)
 
+        if self == DB.Hippo:
+            from .hippo.config import HippoIndexConfig
+            return HippoIndexConfig
+
         # DB.Pinecone, DB.Chroma, DB.Redis
         return EmptyDBCaseConfig
 
diff --git a/vectordb_bench/backend/clients/hippo/config.py b/vectordb_bench/backend/clients/hippo/config.py
new file mode 100644
index 000000000..0c9318ca0
--- /dev/null
+++ b/vectordb_bench/backend/clients/hippo/config.py
@@ -0,0 +1,67 @@
+from pydantic import BaseModel, Field, SecretStr
+from transwarp_hippo_api.hippo_type import IndexType
+from transwarp_hippo_api.hippo_type import MetricType as HippoMetricType
+
+from ..api import DBCaseConfig, DBConfig, MetricType
+
+
+class HippoConfig(DBConfig):  # connection and table-layout settings for the Hippo client
+    ip: SecretStr = ""
+    port: SecretStr = "18902"
+    username: SecretStr = "shiva"
+    password: SecretStr = "shiva"
+    number_of_shards: int = Field(default=1, ge=1)  # passed to create_table
+    number_of_replicas: int = Field(default=1, ge=1)  # passed to create_table
+    insert_batch_size: int = Field(default=100, ge=1)  # rows sent per insert_rows() call
+
+    def to_dict(self) -> dict:  # unwrap the secrets into the plain dict the Hippo wrapper reads
+        return {
+            "host_port": [  # a single "ip:port" endpoint, wrapped in a list
+                f"{self.ip.get_secret_value()}:{self.port.get_secret_value()}"
+            ],
+            "username": self.username.get_secret_value(),
+            "pwd": self.password.get_secret_value(),
+            "number_of_shards": self.number_of_shards,
+            "number_of_replicas": self.number_of_replicas,
+            "insert_batch_size": self.insert_batch_size,
+        }
+
+
+class HippoIndexConfig(BaseModel, DBCaseConfig):  # per-case index and search tunables for Hippo
+    index: IndexType = IndexType.HNSW  # HNSW, FLAT, IVF_FLAT, IVF_SQ, IVF_PQ, ANNOY
+    metric_type: MetricType | None = None  # benchmark metric; translated by parse_metric()
+    M: int = 30  # [4,96]
+    ef_construction: int = 360  # [8, 512]
+    ef_search: int = 100  # [topk, 32768]
+    nlist: int = 1024  # [1,65536]
+    nprobe: int = 64  # [1, nlist]
+    m: int = 16  # divisible by dim
+    nbits: int = 8  # [1, 16]
+    k_factor: int = 100  # [10, 1000]
+
+    def parse_metric(self) -> HippoMetricType:  # benchmark MetricType -> Hippo SDK metric enum
+        if self.metric_type == MetricType.COSINE:
+            return HippoMetricType.COSINE
+        if self.metric_type == MetricType.IP:
+            return HippoMetricType.IP
+        if self.metric_type == MetricType.L2:
+            return HippoMetricType.L2
+        return ""  # NOTE(review): None or an unmapped metric yields "" despite the HippoMetricType annotation
+
+    def index_param(self) -> dict:  # every build-time knob is returned, whichever `index` is selected
+        return {
+            "M": self.M,
+            "ef_construction": self.ef_construction,
+            "ef_search": self.ef_search,
+            "nlist": self.nlist,
+            "nprobe": self.nprobe,
+            "m": self.m,
+            "nbits": self.nbits,
+        }
+
+    def search_param(self) -> dict:  # query-time knobs, expanded as keyword arguments into the table query
+        return {
+            "ef_search": self.ef_search,
+            "nprobe": self.nprobe,
+            "k_factor": self.k_factor,
+        }
diff --git a/vectordb_bench/backend/clients/hippo/hippo.py b/vectordb_bench/backend/clients/hippo/hippo.py
new file mode 100644
index 000000000..3f17bd302
--- /dev/null
+++ b/vectordb_bench/backend/clients/hippo/hippo.py
@@ -0,0 +1,212 @@
+import logging
+from contextlib import contextmanager
+from typing import Iterable
+
+import numpy as np
+from transwarp_hippo_api.hippo_client import HippoClient, HippoField
+from transwarp_hippo_api.hippo_type import HippoType
+
+from ..api import VectorDB
+from .config import HippoIndexConfig
+
+log = logging.getLogger(__name__)
+
+
+class Hippo(VectorDB):
+    def __init__(
+        self,
+        dim: int,
+        db_config: dict,
+        db_case_config: HippoIndexConfig,
+        drop_old: bool = False,
+        **kwargs,
+    ):
+        """Store connection and index settings; when `drop_old` is set, recreate the table and its vector index."""
+        self.name = "Hippo"
+        self.db_config = db_config
+        self.index_config = db_case_config
+
+        self.database_name = "default"
+        self.table_name = "vdbbench_table"
+        self.index_name = "vector_index"
+
+        self.vector_field_name = "vector"
+        self.int_field_name = "label"
+        self.pk_field_name = "pk"
+
+        self.insert_batch_size = db_config.get("insert_batch_size")
+        self.activated = False
+
+        # Setup-only client, built even when `drop_old` is False; init() builds a separate client.
+        hc = HippoClient(
+            **{
+                k: db_config[k]
+                for k in ["host_port", "username", "pwd"]
+                if k in db_config
+            }
+        )
+        if drop_old:
+            try:
+                table_check = hc.check_table_exists(
+                    self.table_name, database_name=self.database_name
+                )
+                log.info(f"check table exsited: {table_check}")
+            except ValueError as e:
+                log.error("failed to check table exsited; skip", exc_info=e)
+                table_check = False
+
+            if table_check:
+                log.info(f"delete table: {self.table_name}")
+                hc.delete_table(self.table_name, database_name=self.database_name)
+                hc.delete_table_in_trash(
+                    self.table_name, database_name=self.database_name
+                )
+
+            # create table -- NOTE(review): only reached when drop_old is True; presumably the table must already exist otherwise
+            fields = [
+                HippoField(self.pk_field_name, True, HippoType.INT64),
+                HippoField(self.int_field_name, False, HippoType.INT64),
+                HippoField(
+                    self.vector_field_name,
+                    False,
+                    HippoType.FLOAT_VECTOR,
+                    type_params={"dimension": dim},
+                ),
+            ]
+            log.info(f"create table: {self.table_name}")
+            hc.create_table(
+                name=self.table_name,
+                fields=fields,
+                database_name=self.database_name,
+                number_of_shards=db_config.get("number_of_shards"),
+                number_of_replicas=db_config.get("number_of_replicas"),
+            )
+
+            table = hc.get_table(self.table_name, database_name=self.database_name)
+            # create the vector index; it is activated later by _activate_index()
+            log.info("create index")
+            table.create_index(
+                field_name=self.vector_field_name,
+                index_name=self.index_name,
+                index_type=self.index_config.index,
+                metric_type=self.index_config.parse_metric(),
+                **self.index_config.index_param(),
+            )
+
+    def need_normalize_cosine(self) -> bool:
+        """Whether this database needs the dataset normalized to support COSINE (always False here)."""
+        return False
+
+    @contextmanager
+    def init(self):
+        """
+        Build a new HippoClient and bind `self.client` to the benchmark table.
+        Examples:
+            >>> with self.init():
+            >>>     self.insert_embeddings()
+            >>>     self.search_embedding()
+        """
+        from transwarp_hippo_api.hippo_client import HippoClient  # NOTE(review): duplicates the module-level import
+
+        hc = HippoClient(
+            **{
+                k: self.db_config[k]
+                for k in ["host_port", "username", "pwd"]
+                if k in self.db_config
+            }
+        )
+        self.client = hc.get_table(self.table_name, database_name=self.database_name)
+
+        yield  # no teardown follows: self.client stays bound after the with-block exits
+
+    def _activate_index(self):  # attempt to activate the vector index at most once per instance
+        if not self.activated:
+            try:
+                log.info("start activate index, please wait ...")
+                self.client.activate_index(
+                    self.index_name, wait_for_completion=True, timeout="25h"
+                )
+                log.info("index is actived.")
+            except Exception as e:
+                log.error("failed to activate index; skip", exc_info=e)
+
+            self.activated = True  # set even when activation failed, so it is not retried
+
+    def insert_embeddings(
+        self, embeddings: Iterable[list[float]], metadata: list[int], **kwargs
+    ):  # returns (rows_inserted, None) on success or (rows_inserted_so_far, exception) on failure
+        assert self.client is not None
+        insert_count = 0
+        try:
+            for batch_start_offset in range(0, len(embeddings), self.insert_batch_size):  # NOTE(review): len() and slicing need a sequence, though the annotation says Iterable
+                log.info("batch offset: %d", batch_start_offset)
+
+                data = [  # one list per column; presumably in table-field order (pk, label, vector)
+                    list(  # 1st column: metadata ids
+                        metadata[
+                            batch_start_offset : batch_start_offset
+                            + self.insert_batch_size
+                        ]
+                    ),
+                    list(  # 2nd column: the same metadata ids again
+                        metadata[
+                            batch_start_offset : batch_start_offset
+                            + self.insert_batch_size
+                        ]
+                    ),
+                    [  # 3rd column: vectors, with numpy arrays converted to plain lists
+                        i.tolist() if isinstance(i, np.ndarray) else i
+                        for i in embeddings[
+                            batch_start_offset : batch_start_offset
+                            + self.insert_batch_size
+                        ]
+                    ],
+                ]
+
+                self.client.insert_rows(data)
+                insert_count += len(data[0])
+            # Index activation is not triggered here, even for the last batch;
+            # optimize() calls _activate_index() instead.
+        except Exception as e:
+            log.error("hippp insert error", exc_info=e)
+            return (insert_count, e)
+
+        log.info("total insert: %d", insert_count)
+
+        return (insert_count, None)
+
+    def search_embedding(
+        self,
+        query: list[float],
+        k: int = 100,
+        filters: dict | None = None,
+        timeout: int | None = None,
+    ) -> list[int]:
+        # Run one top-k query and return the label values of the hits; `timeout` is accepted but not used.
+
+        dsl = f"{self.int_field_name} >= {filters['id']}" if filters else ""  # optional scalar filter on the label field
+        output_fields = [self.int_field_name]
+        result = self.client.query(
+            self.vector_field_name,
+            [query],
+            output_fields,
+            k,
+            dsl=dsl,
+            **self.index_config.search_param(),
+        )
+
+        return result[0][self.int_field_name]  # a single query vector was sent, so take the first result
+
+    def optimize(self, **kwargs):  # activate the vector index; add a scalar index when `filters` is passed
+        self._activate_index()
+
+        if kwargs.get("filters"):  # truthy only when the caller passes a non-empty `filters` keyword
+            log.info(f"create scalar index on field: {self.int_field_name}")
+            self.client.create_scalar_index(
+                field_names=[self.int_field_name],
+                index_name="idx_" + self.int_field_name,
+            )
+            log.info("scalar index created")
+
+    def ready_to_load(self):  # no preparation is performed for Hippo
+        return
diff --git a/vectordb_bench/backend/clients/hippo/test.py b/vectordb_bench/backend/clients/hippo/test.py
new file mode 100644
index 000000000..2d20d711b
--- /dev/null
+++ b/vectordb_bench/backend/clients/hippo/test.py
@@ -0,0 +1,93 @@
+from transwarp_hippo_api.hippo_client import HippoClient, HippoField
+from transwarp_hippo_api.hippo_type import HippoType, IndexType, MetricType
+import numpy as np
+
+ip = ""
+port = ""
+username = ""
+pwd = ""
+
+dim = 128
+n_train = 10000
+n_test = 100
+
+# connect
+hc = HippoClient([f"{ip}:{port}"], username=username, pwd=pwd)
+
+# create database
+database_name = "default"
+# db = hc.create_database(database_name)
+
+# create table
+table_name = "vdbbench_table"
+# table_check = hc.check_table_exists(table_name, database_name=database_name)
+# if table_check:
+#     hc.delete_table(table_name, database_name=database_name)
+#     hc.delete_table_in_trash(table_name, database_name=database_name)
+vector_field_name = "vector"
+int_field_name = "label"
+pk_field_name = "pk"
+fields = [
+    HippoField(pk_field_name, True, HippoType.INT64),
+    HippoField(int_field_name, False, HippoType.INT64),
+    HippoField(vector_field_name, False, HippoType.FLOAT_VECTOR,
+               type_params={"dimension": dim}),
+]
+client = hc.create_table(name=table_name, fields=fields,
+                         database_name=database_name, number_of_shards=1, number_of_replicas=1)
+
+
+# get table
+client = hc.get_table(table_name, database_name=database_name)
+
+
+# create index
+index_name = "vector_index"
+M = 30  # [4,96]
+ef_construction = 360  # [8, 512]
+ef_search = 100  # [topk, 32768]
+client.create_index(field_name=vector_field_name, index_name=index_name,
+                    index_type=IndexType.HNSW, metric_type=MetricType.L2,
+                    M=M, ef_construction=ef_construction, ef_search=ef_search)
+
+
+# # load?
+# index_loaded = client.load_index(index_name)
+
+# insert
+pk_data = np.arange(n_train)
+int_data = np.random.randint(0, 100, n_train)
+vector_data = np.random.rand(n_train, dim)
+batch_size = 100
+for offset in range(0, n_train, batch_size):
+    start = offset
+    end = offset + batch_size
+    print(f"insert {start}-{end}")
+    data = [
+        pk_data[start:end].tolist(), int_data[start:end].tolist(
+        ), vector_data[start:end].tolist(),
+    ]
+    client.insert_rows(data)
+
+# need activate - like milvus load
+client.activate_index(index_name, wait_for_completion=True, timeout="25h")
+
+# ann search
+query_vectors = np.random.rand(n_test, dim)
+output_fields = [pk_field_name, int_field_name]
+k = 10
+dsl = f"{int_field_name} >= 90"
+result = client.query(vector_field_name, query_vectors.tolist(),
+                      output_fields, topk=k, dsl=dsl)
+print(result[0])
+
+result = client.query(vector_field_name, query_vectors.tolist(),
+                      output_fields, topk=100)
+print(result[0])
+
+# delete table
+hc.delete_table(table_name, database_name=database_name)
+hc.delete_table_in_trash(table_name, database_name=database_name)
+
+# # delete database
+# hc.delete_database(database_name)
diff --git a/vectordb_bench/backend/data_source.py b/vectordb_bench/backend/data_source.py
index 65926ff6b..5f944b065 100644
--- a/vectordb_bench/backend/data_source.py
+++ b/vectordb_bench/backend/data_source.py
@@ -132,7 +132,8 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec
                 remote_file = pathlib.Path(self.remote_root, dataset, file)
                 local_file = local_ds_root.joinpath(file)
 
-                if (not local_file.exists()) or (not self.validate_file(remote_file, local_file, check_etag)):
+                # if (not local_file.exists()) or (not self.validate_file(remote_file, local_file, check_etag)):
+                if (not local_file.exists()):
                     log.info(f"local file: {local_file} not match with remote: {remote_file}; add to downloading list")
                     downloads.append(remote_file)
 
diff --git a/vectordb_bench/backend/runner/serial_runner.py b/vectordb_bench/backend/runner/serial_runner.py
index aeed0ec74..e11822f9a 100644
--- a/vectordb_bench/backend/runner/serial_runner.py
+++ b/vectordb_bench/backend/runner/serial_runner.py
@@ -47,6 +47,7 @@ def task(self) -> int:
                 log.debug(f"batch dataset size: {len(all_embeddings)}, {len(all_metadata)}")
 
                 last_batch = self.dataset.data.size - count == len(all_metadata)
+                log.info(f"last_batch: {last_batch} data size: {self.dataset.data.size} count: {count}")
                 insert_count, error = self.db.insert_embeddings(
                     embeddings=all_embeddings,
                     metadata=all_metadata,
@@ -114,7 +115,7 @@ def _insert_all_batches(self) -> int:
                     psutil.Process(pid).kill()
                 raise PerformanceTimeoutError(msg) from e
             except Exception as e:
-                log.warning(f"VectorDB load dataset error: {e}")
+                log.error("VectorDB load dataset error: ", exc_info=e)
                 raise e from e
             else:
                 return count
@@ -169,7 +170,7 @@ def __init__(
             self.test_data = test_data
         self.ground_truth = ground_truth
 
-    def search(self, args: tuple[list, pd.DataFrame]):
+    def search(self, args: tuple[list, pd.DataFrame]) -> tuple[float, float, float]:
         log.info(f"{mp.current_process().name:14} start search the entire test_data to get recall and latency")
         with self.db.init():
             test_data, ground_truth = args
@@ -213,14 +214,14 @@ def search(self, args: tuple[list, pd.DataFrame]):
             f"avg_latency={avg_latency}, "
             f"p99={p99}"
          )
-        return (avg_recall, p99)
+        return (avg_recall, p99, avg_latency)
 
 
-    def _run_in_subprocess(self) -> tuple[float, float]:
+    def _run_in_subprocess(self) -> tuple[float, float, float]:
         with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
             future = executor.submit(self.search, (self.test_data, self.ground_truth))
             result = future.result()
             return result
 
-    def run(self) -> tuple[float, float]:
+    def run(self) -> tuple[float, float, float]:
         return self._run_in_subprocess()
diff --git a/vectordb_bench/backend/task_runner.py b/vectordb_bench/backend/task_runner.py
index 80c5ac1df..b78a6bab3 100644
--- a/vectordb_bench/backend/task_runner.py
+++ b/vectordb_bench/backend/task_runner.py
@@ -58,7 +58,7 @@ def __eq__(self, obj):
                 self.config.db == obj.config.db and \
                 self.config.db_case_config == obj.config.db_case_config and \
                 self.ca.dataset == obj.ca.dataset
-            return False
+        return False
 
     def display(self) -> dict:
         c_dict = self.ca.dict(include={'label':True, 'filters': True,'dataset':{'data': {'name': True, 'size': True, 'dim': True, 'metric_type': True, 'label': True}} })
@@ -140,7 +140,7 @@ def _run_perf_case(self, drop_old: bool = True) -> Metric:
                 )
 
             self._init_search_runner()
-            m.recall, m.serial_latency_p99 = self._serial_search()
+            m.recall, m.serial_latency_p99, m.serial_latency_avg = self._serial_search()
             m.qps = self._conc_search()
         except Exception as e:
             log.warning(f"Failed to run performance case, reason = {e}")
@@ -161,7 +161,7 @@ def _load_train_data(self):
         finally:
             runner = None
 
-    def _serial_search(self) -> tuple[float, float]:
+    def _serial_search(self) -> tuple[float, float, float]:
         """Performance serial tests, search the entire test data once,
         calculate the recall, serial_latency_p99
 
@@ -193,7 +193,7 @@ def _conc_search(self):
     @utils.time_it
     def _task(self) -> None:
         with self.db.init():
-            self.db.optimize()
+            self.db.optimize(filters=self.ca.filters)
 
     def _optimize(self) -> float:
         with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
diff --git a/vectordb_bench/frontend/components/run_test/dbSelector.py b/vectordb_bench/frontend/components/run_test/dbSelector.py
index 61db843f3..40c5ee5f9 100644
--- a/vectordb_bench/frontend/components/run_test/dbSelector.py
+++ b/vectordb_bench/frontend/components/run_test/dbSelector.py
@@ -30,7 +30,7 @@ def dbSelector(st):
     for i, db in enumerate(DB_LIST):
         column = dbContainerColumns[i % DB_SELECTOR_COLUMNS]
         dbIsActived[db] = column.checkbox(db.name)
-        column.image(DB_TO_ICON.get(db, ""))
+        column.image(DB_TO_ICON.get(db, ""), width=100)
     activedDbList = [db for db in DB_LIST if dbIsActived[db]]
 
     return activedDbList
diff --git a/vectordb_bench/frontend/const/dbCaseConfigs.py b/vectordb_bench/frontend/const/dbCaseConfigs.py
index fad5f362d..ad6a3b73f 100644
--- a/vectordb_bench/frontend/const/dbCaseConfigs.py
+++ b/vectordb_bench/frontend/const/dbCaseConfigs.py
@@ -1,10 +1,12 @@
-from enum import IntEnum
 import typing
+from enum import IntEnum
+
 from pydantic import BaseModel
+from transwarp_hippo_api.hippo_type import IndexType as HippoIndexType
+
 from vectordb_bench.backend.cases import CaseLabel, CaseType
 from vectordb_bench.backend.clients import DB
 from vectordb_bench.backend.clients.api import IndexType
-
 from vectordb_bench.models import CaseConfigParamType
 
 MAX_STREAMLIT_INT = (1 << 53) - 1
@@ -419,6 +421,156 @@ class CaseConfigInput(BaseModel):
     },
 )
 
+CaseConfigParamInput_IndexType_Hippo = CaseConfigInput(
+    label=CaseConfigParamType.IndexType,
+    inputType=InputType.Option,
+    inputConfig={
+        "options": [
+            HippoIndexType.HNSW.value,
+            HippoIndexType.FLAT.value,
+            HippoIndexType.IVF_FLAT.value,
+            HippoIndexType.IVF_SQ.value,
+            HippoIndexType.IVF_PQ.value,
+            HippoIndexType.ANNOY.value,
+        ],
+    },
+)
+
+CaseConfigParamInput_M_Hippo = CaseConfigInput(
+    label=CaseConfigParamType.M,
+    inputType=InputType.Number,
+    inputConfig={
+        "min": 4,
+        "max": 64,
+        "value": 30,
+    },
+    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None)
+    == HippoIndexType.HNSW.value,
+)
+
+CaseConfigParamInput_EFConstruction_Hippo = CaseConfigInput(
+    label=CaseConfigParamType.ef_construction,
+    inputType=InputType.Number,
+    inputConfig={
+        "min": 8,
+        "max": 512,
+        "value": 360,
+    },
+    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None)
+    == HippoIndexType.HNSW.value,
+)
+
+CaseConfigParamInput_EFSearch_Hippo = CaseConfigInput(
+    label=CaseConfigParamType.ef_search,
+    inputType=InputType.Number,
+    inputConfig={
+        "min": 100,
+        "max": MAX_STREAMLIT_INT,
+        "value": 100,
+    },
+    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None)
+    == HippoIndexType.HNSW.value,
+)
+
+CaseConfigParamInput_Nlist_Hippo = CaseConfigInput(
+    label=CaseConfigParamType.Nlist,
+    inputType=InputType.Number,
+    inputConfig={
+        "min": 1,
+        "max": 65536,
+        "value": 1024,
+    },
+    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None)
+    in [
+        HippoIndexType.IVF_FLAT.value,
+        HippoIndexType.IVF_SQ.value,
+        HippoIndexType.IVF_PQ.value,
+        # TODO: add ivf_pq_fs
+    ],
+)
+
+CaseConfigParamInput_Nprobe_Hippo = CaseConfigInput(
+    label=CaseConfigParamType.Nprobe,
+    inputType=InputType.Number,
+    inputConfig={
+        "min": 1,
+        "max": 65536,
+        "value": 64,
+    },
+    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None)
+    in [
+        HippoIndexType.IVF_FLAT.value,
+        HippoIndexType.IVF_SQ.value,
+        HippoIndexType.IVF_PQ.value,
+        # TODO: add ivf_pq_fs
+    ],
+)
+
+CaseConfigParamInput_m_Hippo = CaseConfigInput(
+    label=CaseConfigParamType.m,
+    inputType=InputType.Number,
+    inputConfig={
+        "min": 1,
+        "max": 1024,
+        "value": 16,
+    },
+    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None)
+    in [
+        HippoIndexType.IVF_PQ.value,
+        # TODO: add ivf_pq_fs
+    ],
+)
+
+CaseConfigParamInput_nbits_Hippo = CaseConfigInput(
+    label=CaseConfigParamType.nbits,
+    inputType=InputType.Number,
+    inputConfig={
+        "min": 1,
+        "max": 16,
+        "value": 8,
+    },
+    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None)
+    in [
+        HippoIndexType.IVF_PQ.value,
+    ],
+)
+
+CaseConfigParamInput_k_factor_Hippo = CaseConfigInput(
+    label=CaseConfigParamType.k_factor,
+    inputType=InputType.Number,
+    inputConfig={
+        "min": 1,
+        "max": 1000,
+        "value": 100,
+    },
+    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None)
+    in [
+        HippoIndexType.IVF_PQ.value,
+        HippoIndexType.IVF_SQ.value,
+        # TODO: add ivf_pq_fs
+    ],
+)
+
+CaseConfigParamInput_index_slow_refine_Hippo = CaseConfigInput(
+    label=CaseConfigParamType.index_slow_refine,
+    inputType=InputType.Option,
+    inputConfig={"options": [False, True]},
+    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None)
+    in [
+        HippoIndexType.IVF_PQ.value,
+        HippoIndexType.IVF_SQ.value,
+        # TODO: add ivf_pq_fs
+    ],
+)
+
+CaseConfigParamInput_sq_type_Hippo = CaseConfigInput(
+    label=CaseConfigParamType.sq_type,
+    inputType=InputType.Text,
+    inputConfig={"value": ""},
+    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None)
+    == HippoIndexType.IVF_SQ.value,
+)
+
 MilvusLoadConfig = [
     CaseConfigParamInput_IndexType,
     CaseConfigParamInput_M,
@@ -496,6 +648,29 @@ class CaseConfigInput(BaseModel):
     CaseConfigParamInput_ZillizLevel,
 ]
 
+HippoLoadConfig = [
+    CaseConfigParamInput_IndexType_Hippo,
+    CaseConfigParamInput_M_Hippo,
+    CaseConfigParamInput_EFConstruction_Hippo,
+    CaseConfigParamInput_EFSearch_Hippo,
+    CaseConfigParamInput_Nlist_Hippo,
+    CaseConfigParamInput_m_Hippo,
+    CaseConfigParamInput_nbits_Hippo,
+]
+HippoPerformanceConfig = [
+    CaseConfigParamInput_IndexType_Hippo,
+    CaseConfigParamInput_M_Hippo,
+    CaseConfigParamInput_EFConstruction_Hippo,
+    CaseConfigParamInput_EFSearch_Hippo,
+    CaseConfigParamInput_Nlist_Hippo,
+    CaseConfigParamInput_Nprobe_Hippo,
+    CaseConfigParamInput_m_Hippo,
+    CaseConfigParamInput_nbits_Hippo,
+    CaseConfigParamInput_k_factor_Hippo,
+    CaseConfigParamInput_index_slow_refine_Hippo,
+    CaseConfigParamInput_sq_type_Hippo,
+]
+
 CASE_CONFIG_MAP = {
     DB.Milvus: {
         CaseLabel.Load: MilvusLoadConfig,
@@ -520,4 +695,8 @@ class CaseConfigInput(BaseModel):
         CaseLabel.Load: PgVectoRSLoadingConfig,
         CaseLabel.Performance: PgVectoRSPerformanceConfig,
     },
+    DB.Hippo: {
+        CaseLabel.Load: HippoLoadConfig,
+        CaseLabel.Performance: HippoPerformanceConfig,
+    },
 }
diff --git a/vectordb_bench/frontend/const/styles.py b/vectordb_bench/frontend/const/styles.py
index 52d1017a9..ad3e4d9c0 100644
--- a/vectordb_bench/frontend/const/styles.py
+++ b/vectordb_bench/frontend/const/styles.py
@@ -46,6 +46,7 @@ def getPatternShape(i):
     DB.PgVectoRS: "https://assets.zilliz.com/PG_Vector_d464f2ef5f.png",
     DB.Redis: "https://assets.zilliz.com/Redis_Cloud_74b8bfef39.png",
     DB.Chroma: "https://assets.zilliz.com/chroma_ceb3f06ed7.png",
+    DB.Hippo: "https://assets.zilliz.com/hippo_3ce85bc90f.png",
 }
 
 # RedisCloud color: #0D6EFD
@@ -59,4 +60,5 @@ def getPatternShape(i):
     DB.WeaviateCloud.value: "#20C997",
     DB.PgVector.value: "#4C779A",
     DB.Redis.value: "#0D6EFD",
+    DB.Hippo.value: "#333",
 }
diff --git a/vectordb_bench/metric.py b/vectordb_bench/metric.py
index a2b6d6ff0..be90f3adb 100644
--- a/vectordb_bench/metric.py
+++ b/vectordb_bench/metric.py
@@ -18,12 +18,14 @@ class Metric:
     load_duration: float = 0.0  # duration to load all dataset into DB
     qps: float = 0.0
     serial_latency_p99: float = 0.0
+    serial_latency_avg: float = 0.0
     recall: float = 0.0
 
 
 QURIES_PER_DOLLAR_METRIC = "QP$ (Quries per Dollar)"
 LOAD_DURATION_METRIC = "load_duration"
 SERIAL_LATENCY_P99_METRIC = "serial_latency_p99"
+SERIAL_LATENCY_AVG_METRIC = "serial_latency_avg"
 MAX_LOAD_COUNT_METRIC = "max_load_count"
 QPS_METRIC = "qps"
 RECALL_METRIC = "recall"
@@ -31,6 +33,7 @@ class Metric:
 metricUnitMap = {
     LOAD_DURATION_METRIC: "s",
     SERIAL_LATENCY_P99_METRIC: "ms",
+    SERIAL_LATENCY_AVG_METRIC: "ms",
     MAX_LOAD_COUNT_METRIC: "K",
     QURIES_PER_DOLLAR_METRIC: "K",
 }
@@ -38,6 +41,7 @@ class Metric:
 lowerIsBetterMetricList = [
     LOAD_DURATION_METRIC,
     SERIAL_LATENCY_P99_METRIC,
+    SERIAL_LATENCY_AVG_METRIC,
 ]
 
 metricOrder = [
@@ -45,6 +49,7 @@ class Metric:
     RECALL_METRIC,
     LOAD_DURATION_METRIC,
     SERIAL_LATENCY_P99_METRIC,
+    SERIAL_LATENCY_AVG_METRIC,
     MAX_LOAD_COUNT_METRIC,
 ]
 
diff --git a/vectordb_bench/models.py b/vectordb_bench/models.py
index 3c2a5b9aa..da7da9ca6 100644
--- a/vectordb_bench/models.py
+++ b/vectordb_bench/models.py
@@ -1,23 +1,22 @@
 import logging
 import pathlib
 from datetime import date
-from typing import Self
 from enum import Enum
+from typing import Self
 
 import ujson
 
+from . import config
+from .backend.cases import CaseType
 from .backend.clients import (
     DB,
-    DBConfig,
     DBCaseConfig,
+    DBConfig,
     IndexType,
 )
-from .backend.cases import CaseType
 from .base import BaseModel
-from . import config
 from .metric import Metric
 
-
 log = logging.getLogger(__name__)
 
 
@@ -60,6 +59,11 @@ class CaseConfigParamType(Enum):
     cache_dataset_on_device = "cache_dataset_on_device"
     refine_ratio = "refine_ratio"
     level = "level"
+    ef_construction = "ef_construction"
+    ef_search = "ef_search"
+    k_factor = "k_factor"
+    index_slow_refine = "index_slow_refine"
+    sq_type = "sq_type"
 
 
 class CustomizedCase(BaseModel):

From 8bb1b529078952b4c00c7ab80f9b418b395e281c Mon Sep 17 00:00:00 2001
From: "bingtao.yin" <bingtao.yin@transwarp.io>
Date: Fri, 1 Mar 2024 04:52:12 +0000
Subject: [PATCH 2/3] remove devcontainer and ci-transwarp files, trim .gitignore

---
 .devcontainer/devcontainer.json    | 50 -------------------
 .devcontainer/postCreateCommand.sh | 34 -------------
 .gitignore                         |  8 +---
 ci-transwarp/Dockerfile            | 33 -------------
 ci-transwarp/Note.md               | 77 ------------------------------
 ci-transwarp/pip.conf              |  5 --
 6 files changed, 1 insertion(+), 206 deletions(-)
 delete mode 100644 .devcontainer/devcontainer.json
 delete mode 100644 .devcontainer/postCreateCommand.sh
 delete mode 100644 ci-transwarp/Dockerfile
 delete mode 100644 ci-transwarp/Note.md
 delete mode 100644 ci-transwarp/pip.conf

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
deleted file mode 100644
index b3439a14e..000000000
--- a/.devcontainer/devcontainer.json
+++ /dev/null
@@ -1,50 +0,0 @@
-// For format details, see https://aka.ms/devcontainer.json. For config options, see the
-// README at: https://github.com/devcontainers/templates/tree/main/src/cpp
-{
-    "name": "vectordb bench dev",
-    "image": "172.16.1.99/hippo/vectordb_bench/builder:latest",
-    "runArgs": [
-        "--privileged",
-        "--cap-add=SYS_PTRACE",
-        "--security-opt",
-        "seccomp=unconfined"
-    ],
-    "mounts": [
-        "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind"
-    ],
-    "workspaceMount": "source=${localWorkspaceFolder},target=/opt/transwarp/vectordb_bench,type=bind,consistency=cached",
-    "workspaceFolder": "/opt/transwarp/vectordb_bench",
-    "customizations": {
-        "vscode": {
-            // Set *default* container specific settings.json values on container create.
-            "settings": {
-                "terminal.integrated.shell.linux": "/bin/bash",
-                "C_Cpp.default.cppStandard": "c++20",
-                "C_Cpp.default.cStandard": "c17",
-                "C_Cpp.default.browse.databaseFilename": "${workspaceFolder}/.vscode/.browse.c_cpp.db",
-            },
-            // Add the IDs of extensions you want installed when the container is created.
-            "extensions": [
-                "foxundermoon.shell-format",
-                "redhat.vscode-yaml",
-                "ms-azuretools.vscode-docker",
-                "EditorConfig.EditorConfig",
-                "codezombiech.gitignore",
-                "yzhang.markdown-all-in-one",
-                "SonarSource.sonarlint-vscode",
-                "GitHub.copilot",
-                "ms-python.python",
-                "ms-python.debugpy",
-                "VisualStudioExptTeam.vscodeintellicode",
-                "donjayamanne.python-environment-manager",
-                "charliermarsh.ruff"
-            ]
-        }
-    },
-    // Use 'forwardPorts' to make a list of ports inside the container available locally.
-    // "forwardPorts": [],
-    // Use 'postCreateCommand' to run commands after the container is created.
-    "postCreateCommand": "bash ./.devcontainer/postCreateCommand.sh"
-    // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
-    // "remoteUser": "vscode"
-}
\ No newline at end of file
diff --git a/.devcontainer/postCreateCommand.sh b/.devcontainer/postCreateCommand.sh
deleted file mode 100644
index 3f8f77fcc..000000000
--- a/.devcontainer/postCreateCommand.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-
-function create_user {
-  USERNAME=dev
-  USER_UID=1000
-  USER_GID=$USER_UID
-
-  # Create the user
-  groupadd --gid $USER_GID $USERNAME
-  useradd --uid $USER_UID --gid $USER_GID -m $USERNAME
-
-  # [Optional] Add sudo support. Omit if you don't need to install software after connecting.
-  apt install -y sudo
-  echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME
-  chmod 0440 /etc/sudoers.d/$USERNAME  
-}
-
-function fix_git {
-  # touch ~/.gitconfig
-
-  git config --global --add safe.directory /opt/transwarp/vectordb_bench
-
-  git config --global --unset http.https://github.com.proxy
-  git config --global --unset https.https://github.com.proxy
-
-}
-
-function main {
-  # create_user
-
-  fix_git
-}
-
-main $@
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 0a316c319..8d28d7eeb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,9 +10,6 @@ build/
 venv/
 .idea/
 
-# result files
-vectordb_bench/results/**
-
 # vscode files
 .vscode/*
 !.vscode/tasks.json
@@ -26,7 +23,4 @@ vectordb_bench/results/**
 .history/
 
 # Built Visual Studio Code Extensions
-*.vsix
-
-# ruff files
-.ruff_cache
\ No newline at end of file
+*.vsix
\ No newline at end of file
diff --git a/ci-transwarp/Dockerfile b/ci-transwarp/Dockerfile
deleted file mode 100644
index 6c21a5219..000000000
--- a/ci-transwarp/Dockerfile
+++ /dev/null
@@ -1,33 +0,0 @@
-FROM ubuntu:22.04
-
-ARG build_dir=/opt/transwarp/vectordb_bench
-
-RUN \
-    # basics
-    PKGS="software-properties-common vim sudo locales git" && \
-    apt-get update && \
-    apt-get install -y ${PKGS} && \
-    locale-gen en_US.UTF-8 && \
-    ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/timezone && \
-    ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
-    # python
-    add-apt-repository -y ppa:deadsnakes/ppa && \
-    apt-get update && \
-    apt-get install -y python3-pip python3.11 python3.11-dev && \
-    apt-get clean && apt-get autoclean
-
-ENV LC_ALL="en_US.UTF-8"
-ENV LANG="en_US.UTF-8"
-
-COPY ci-transwarp/pip.conf /root/.config/pip/pip.conf
-COPY . ${build_dir}
-
-RUN \
-    update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
-    python -m pip install --upgrade pip && \
-    python -m pip install hippo-api==1.2.0rc1 && \
-    cd ${build_dir} && python -m pip install .[test] && cd ${build_dir} && rm -rf ./*
-
-VOLUME [ "${build_dir}" ]
-
-WORKDIR ${build_dir}
diff --git a/ci-transwarp/Note.md b/ci-transwarp/Note.md
deleted file mode 100644
index 838d1f783..000000000
--- a/ci-transwarp/Note.md
+++ /dev/null
@@ -1,77 +0,0 @@
-CI
-
----
-
-### 准备数据集
-
-```
-aws s3 ls s3://assets.zilliz.com/benchmark/ --region us-west-2 --recursive --no-sign-request
-
-aws s3 cp s3://assets.zilliz.com/benchmark/cohere_medium_1m cohere_medium_1m  --region us-west-2 --recursive --no-sign-request
-```
-
-### 构建
-
-```shell
-docker build \
-    --network=host \
-    -f ci-transwarp/Dockerfile \
-    -t 172.16.1.99/hippo/vectordb_bench/builder \
-    .
-```
-
-### 运行
-
-```shell
-git clone -b dev "http://gitlab+deploy-token-54:AJJ9dcXoYsHXKaHLdb2A@172.16.1.41/distributed-storage/vectordbbench.git"
-
-# docker run这个在上一个clone出来的目录下跑
-docker run \
-    --network=host \
-    -itd \
-    -v $(pwd):/opt/transwarp/vectordb_bench \
-    -v XXXX:/tmp/vectordb_bench/dataset \
-    172.16.1.99/hippo/vectordb_bench/builder bash
-```
-
-XXXX这个目录是数据集的目录，目录结构大概如下（参考tw-node45节点/mnt/disk1/hippo/dataset/vectordb_bench, tar.gz文件忽略）:
-
-```
-[root@tw-node45 vectordb_bench]# tree
-.
-├── cohere
-│   └── cohere_medium_1m
-│       ├── neighbors_head_1p.parquet
-│       ├── neighbors.parquet
-│       ├── neighbors_tail_1p.parquet
-│       ├── shuffle_train.parquet
-│       ├── test.parquet
-│       └── train.parquet
-├── cohere_medium_1m.tar.gz
-└── openai
-    ├── openai_medium_500k
-    │   ├── neighbors_head_1p.parquet
-    │   ├── neighbors.parquet
-    │   ├── neighbors_tail_1p.parquet
-    │   ├── shuffle_train.parquet
-    │   ├── test.parquet
-    │   └── train.parquet
-    ├── openai_small_50k
-    │   ├── neighbors_head_1p.parquet
-    │   ├── neighbors.parquet
-    │   ├── neighbors_tail_1p.parquet
-    │   ├── shuffle_train.parquet
-    │   ├── test.parquet
-    │   └── train.parquet
-    └── openai_small_50k.tar.gz
-
-```
-
-
-容器里执行:
-
-```shell
-cd /opt/transwarp/vectordb_bench
-python -m pip install .
-init_bench
-```
\ No newline at end of file
diff --git a/ci-transwarp/pip.conf b/ci-transwarp/pip.conf
deleted file mode 100644
index ddc2a8931..000000000
--- a/ci-transwarp/pip.conf
+++ /dev/null
@@ -1,5 +0,0 @@
-[global]
-index-url = https://mirrors.aliyun.com/pypi/simple
-
-[install]
-trusted-host = mirrors.aliyun.com

From b648245040b688765f2459d8ef427f476e6c658a Mon Sep 17 00:00:00 2001
From: "bingtao.yin" <bingtao.yin@transwarp.io>
Date: Fri, 1 Mar 2024 05:01:46 +0000
Subject: [PATCH 3/3] re-enable validation of local dataset files against remote

---
 vectordb_bench/backend/data_source.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vectordb_bench/backend/data_source.py b/vectordb_bench/backend/data_source.py
index 5f944b065..65926ff6b 100644
--- a/vectordb_bench/backend/data_source.py
+++ b/vectordb_bench/backend/data_source.py
@@ -132,8 +132,7 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec
                 remote_file = pathlib.Path(self.remote_root, dataset, file)
                 local_file = local_ds_root.joinpath(file)
 
-                # if (not local_file.exists()) or (not self.validate_file(remote_file, local_file, check_etag)):
-                if (not local_file.exists()):
+                if (not local_file.exists()) or (not self.validate_file(remote_file, local_file, check_etag)):
                     log.info(f"local file: {local_file} not match with remote: {remote_file}; add to downloading list")
                     downloads.append(remote_file)