From 2470dbecb59c4772c8ff32291d29ca21a7d6d6d9 Mon Sep 17 00:00:00 2001 From: "min.tian" Date: Fri, 23 Feb 2024 03:45:47 +0000 Subject: [PATCH 1/3] update hippo client --- .devcontainer/devcontainer.json | 50 +++++ .devcontainer/postCreateCommand.sh | 34 +++ .editorconfig | 18 ++ .gitignore | 21 ++ .vscode/launch.json | 28 +++ .vscode/settings.json | 10 + .vscode/tasks.json | 29 +++ ci-transwarp/Dockerfile | 33 +++ ci-transwarp/Note.md | 77 +++++++ ci-transwarp/pip.conf | 5 + vectordb_bench/backend/clients/__init__.py | 13 ++ .../backend/clients/hippo/config.py | 67 ++++++ vectordb_bench/backend/clients/hippo/hippo.py | 212 ++++++++++++++++++ vectordb_bench/backend/clients/hippo/test.py | 93 ++++++++ vectordb_bench/backend/data_source.py | 3 +- .../backend/runner/serial_runner.py | 11 +- vectordb_bench/backend/task_runner.py | 8 +- .../components/run_test/dbSelector.py | 2 +- .../frontend/const/dbCaseConfigs.py | 183 ++++++++++++++- vectordb_bench/frontend/const/styles.py | 2 + vectordb_bench/metric.py | 5 + vectordb_bench/models.py | 14 +- 22 files changed, 900 insertions(+), 18 deletions(-) create mode 100644 .devcontainer/devcontainer.json create mode 100644 .devcontainer/postCreateCommand.sh create mode 100644 .editorconfig create mode 100644 .vscode/launch.json create mode 100644 .vscode/settings.json create mode 100644 .vscode/tasks.json create mode 100644 ci-transwarp/Dockerfile create mode 100644 ci-transwarp/Note.md create mode 100644 ci-transwarp/pip.conf create mode 100644 vectordb_bench/backend/clients/hippo/config.py create mode 100644 vectordb_bench/backend/clients/hippo/hippo.py create mode 100644 vectordb_bench/backend/clients/hippo/test.py diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 000000000..b3439a14e --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,50 @@ +// For format details, see https://aka.ms/devcontainer.json. 
For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/cpp +{ + "name": "vectordb bench dev", + "image": "172.16.1.99/hippo/vectordb_bench/builder:latest", + "runArgs": [ + "--privileged", + "--cap-add=SYS_PTRACE", + "--security-opt", + "seccomp=unconfined" + ], + "mounts": [ + "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind" + ], + "workspaceMount": "source=${localWorkspaceFolder},target=/opt/transwarp/vectordb_bench,type=bind,consistency=cached", + "workspaceFolder": "/opt/transwarp/vectordb_bench", + "customizations": { + "vscode": { + // Set *default* container specific settings.json values on container create. + "settings": { + "terminal.integrated.shell.linux": "/bin/bash", + "C_Cpp.default.cppStandard": "c++20", + "C_Cpp.default.cStandard": "c17", + "C_Cpp.default.browse.databaseFilename": "${workspaceFolder}/.vscode/.browse.c_cpp.db", + }, + // Add the IDs of extensions you want installed when the container is created. + "extensions": [ + "foxundermoon.shell-format", + "redhat.vscode-yaml", + "ms-azuretools.vscode-docker", + "EditorConfig.EditorConfig", + "codezombiech.gitignore", + "yzhang.markdown-all-in-one", + "SonarSource.sonarlint-vscode", + "GitHub.copilot", + "ms-python.python", + "ms-python.debugpy", + "VisualStudioExptTeam.vscodeintellicode", + "donjayamanne.python-environment-manager", + "charliermarsh.ruff" + ] + } + }, + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + // Use 'postCreateCommand' to run commands after the container is created. + "postCreateCommand": "bash ./.devcontainer/postCreateCommand.sh" + // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. 
+ // "remoteUser": "vscode" +} \ No newline at end of file diff --git a/.devcontainer/postCreateCommand.sh b/.devcontainer/postCreateCommand.sh new file mode 100644 index 000000000..3f8f77fcc --- /dev/null +++ b/.devcontainer/postCreateCommand.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +function create_user { + USERNAME=dev + USER_UID=1000 + USER_GID=$USER_UID + + # Create the user + groupadd --gid $USER_GID $USERNAME + useradd --uid $USER_UID --gid $USER_GID -m $USERNAME + + # [Optional] Add sudo support. Omit if you don't need to install software after connecting. + apt install -y sudo + echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME + chmod 0440 /etc/sudoers.d/$USERNAME +} + +function fix_git { + # touch ~/.gitconfig + + git config --global --add safe.directory /opt/transwarp/vectordb_bench + + git config --global --unset http.https://github.com.proxy + git config --global --unset https.https://github.com.proxy + +} + +function main { + # create_user + + fix_git +} + +main $@ \ No newline at end of file diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000..cc30f74dd --- /dev/null +++ b/.editorconfig @@ -0,0 +1,18 @@ +# EditorConfig is awesome: https://EditorConfig.org + +# top-most EditorConfig file +root = true + +[*] +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = false +insert_final_newline = false + +[Dockerfile*] +indent_style = space +indent_size = 4 + +[*.json] +indent_style = space +indent_size = 4 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 004524444..0a316c319 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,24 @@ __MACOSX build/ venv/ .idea/ + +# result files +vectordb_bench/results/** + +# vscode files +.vscode/* +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/settings.json +!.vscode/*.code-snippets +!.vscode/c_cpp_properties.json + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +# 
ruff files +.ruff_cache \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 000000000..fb8612a23 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,28 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Streamlit", + "type": "debugpy", + "request": "launch", + "module": "streamlit", + "args": [ + "run", + "vectordb_bench/frontend/vdb_benchmark.py", + "--logger.level", + "info", + "--theme.base", + "light", + "--theme.primaryColor", + "#3670F2", + "--theme.secondaryBackgroundColor", + "#F0F2F6", + ], + "subProcess": true, + "justMyCode": false + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..88f75752b --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,10 @@ +{ + "[python]": { + "editor.formatOnSave": false, + // "editor.codeActionsOnSave": { + // "source.fixAll": "always", + // "source.organizeImports": "always" + // }, + "editor.defaultFormatter": "charliermarsh.ruff" + } +} \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 000000000..085a47bd1 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,29 @@ +{ + // See https://go.microsoft.com/fwlink/?LinkId=733558 + // for the documentation about the tasks.json format + "version": "2.0.0", + "tasks": [ + { + "label": "build vectordb bench", + "type": "shell", + "command": "python", + "args": [ + "-m", + "pip", + "install", + "-e", + ".[test]" + ], + "group": { + "kind": "build", + "isDefault": true + } + }, + { + "label": "run vectordb bench", + "type": "shell", + "command": "init_bench", + "problemMatcher": [] + } + ] +} \ No newline at end of file diff --git a/ci-transwarp/Dockerfile 
b/ci-transwarp/Dockerfile new file mode 100644 index 000000000..6c21a5219 --- /dev/null +++ b/ci-transwarp/Dockerfile @@ -0,0 +1,33 @@ +FROM ubuntu:22.04 + +ARG build_dir=/opt/transwarp/vectordb_bench + +RUN \ + # basics + PKGS="software-properties-common vim sudo locales git" && \ + apt-get update && \ + apt-get install -y ${PKGS} && \ + locale-gen en_US.UTF-8 && \ + ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/timezone && \ + ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \ + # python + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get install -y python3-pip python3.11 python3.11-dev && \ + apt-get clean && apt-get autoclean + +ENV LC_ALL="en_US.UTF-8" +ENV LANG="en_US.UTF-8" + +COPY ci-transwarp/pip.conf /root/.config/pip/pip.conf +COPY . ${build_dir} + +RUN \ + update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \ + python -m pip install --upgrade pip && \ + python -m pip install hippo-api==1.2.0rc1 && \ + cd ${build_dir} && python -m pip install .[test] && cd ${build_dir} && rm -rf ./* + +VOLUME [ "${build_dir}" ] + +WORKDIR ${build_dir} diff --git a/ci-transwarp/Note.md b/ci-transwarp/Note.md new file mode 100644 index 000000000..838d1f783 --- /dev/null +++ b/ci-transwarp/Note.md @@ -0,0 +1,77 @@ +CI + +--- + +### 准备数据集 + +``` +aws s3 ls s3://assets.zilliz.com/benchmark/ --region us-west-2 --recursive --no-sign-request + +aws s3 cp s3://assets.zilliz.com/benchmark/cohere_medium_1m cohere_medium_1m --region us-west-2 --recursive --no-sign-request +``` + +### 构建 + +```shell +docker build \ + --network=host \ + -f ci-transwarp/Dockerfile \ + -t 172.16.1.99/hippo/vectordb_bench/builder \ + . 
+``` + +### 运行 + +```shell +git clone -b dev "http://<DEPLOY_TOKEN_USER>:<DEPLOY_TOKEN>@172.16.1.41/distributed-storage/vectordbbench.git" + +# docker run这个在上一个clone出来的目录下跑 +docker run \ + --network=host \ + -itd \ + -v $(pwd):/opt/transwarp/vectordb_bench \ + -v XXXX:/tmp/vectordb_bench/dataset \ + 172.16.1.99/hippo/vectordb_bench/builder bash +``` + +XXXX这个目录是数据集的目录,目录结构大概如下(参考tw-node45节点/mnt/disk1/hippo/dataset/vectordb_bench, tar.gz文件忽略): + +``` +[root@tw-node45 vectordb_bench]# tree +. +├── cohere +│   └── cohere_medium_1m +│   ├── neighbors_head_1p.parquet +│   ├── neighbors.parquet +│   ├── neighbors_tail_1p.parquet +│   ├── shuffle_train.parquet +│   ├── test.parquet +│   └── train.parquet +├── cohere_medium_1m.tar.gz +└── openai + ├── openai_medium_500k + │   ├── neighbors_head_1p.parquet + │   ├── neighbors.parquet + │   ├── neighbors_tail_1p.parquet + │   ├── shuffle_train.parquet + │   ├── test.parquet + │   └── train.parquet + ├── openai_small_50k + │   ├── neighbors_head_1p.parquet + │   ├── neighbors.parquet + │   ├── neighbors_tail_1p.parquet + │   ├── shuffle_train.parquet + │   ├── test.parquet + │   └── train.parquet + └── openai_small_50k.tar.gz + +``` + + +容器里执行: + +```shell +cd /opt/transwarp/vectordb_bench +python -m pip install . 
+init_bench +``` \ No newline at end of file diff --git a/ci-transwarp/pip.conf b/ci-transwarp/pip.conf new file mode 100644 index 000000000..ddc2a8931 --- /dev/null +++ b/ci-transwarp/pip.conf @@ -0,0 +1,5 @@ +[global] +index-url = https://mirrors.aliyun.com/pypi/simple + +[install] +trusted-host = mirrors.aliyun.com diff --git a/vectordb_bench/backend/clients/__init__.py b/vectordb_bench/backend/clients/__init__.py index 3df11610b..4662ea53c 100644 --- a/vectordb_bench/backend/clients/__init__.py +++ b/vectordb_bench/backend/clients/__init__.py @@ -32,6 +32,7 @@ class DB(Enum): PgVectoRS = "PgVectoRS" Redis = "Redis" Chroma = "Chroma" + Hippo = "Hippo" @property @@ -76,6 +77,10 @@ def init_cls(self) -> Type[VectorDB]: if self == DB.Chroma: from .chroma.chroma import ChromaClient return ChromaClient + + if self == DB.Hippo: + from .hippo.hippo import Hippo + return Hippo @property def config_cls(self) -> Type[DBConfig]: @@ -120,6 +125,10 @@ def config_cls(self) -> Type[DBConfig]: from .chroma.config import ChromaConfig return ChromaConfig + if self == DB.Hippo: + from .hippo.config import HippoConfig + return HippoConfig + def case_config_cls(self, index_type: IndexType | None = None) -> Type[DBCaseConfig]: if self == DB.Milvus: from .milvus.config import _milvus_case_config @@ -149,6 +158,10 @@ def case_config_cls(self, index_type: IndexType | None = None) -> Type[DBCaseCon from .pgvecto_rs.config import _pgvecto_rs_case_config return _pgvecto_rs_case_config.get(index_type) + if self == DB.Hippo: + from .hippo.config import HippoIndexConfig + return HippoIndexConfig + # DB.Pinecone, DB.Chroma, DB.Redis return EmptyDBCaseConfig diff --git a/vectordb_bench/backend/clients/hippo/config.py b/vectordb_bench/backend/clients/hippo/config.py new file mode 100644 index 000000000..0c9318ca0 --- /dev/null +++ b/vectordb_bench/backend/clients/hippo/config.py @@ -0,0 +1,67 @@ +from pydantic import BaseModel, Field, SecretStr +from transwarp_hippo_api.hippo_type import 
class HippoConfig(DBConfig):
    """Connection and table-layout settings for a Hippo deployment.

    The string defaults are intentionally left as plain ``str`` values (not
    wrapped in ``SecretStr``) so the declared defaults stay readable; pydantic
    does not validate field defaults, so an instance built from defaults holds
    raw strings in these fields.  ``to_dict`` therefore accepts both forms.
    """

    ip: SecretStr = ""
    port: SecretStr = "18902"
    # NOTE(review): default credentials are hard-coded here — confirm these are
    # meant as UI placeholders only and not real deployment secrets.
    username: SecretStr = "shiva"
    password: SecretStr = "shiva"
    number_of_shards: int = Field(default=1, ge=1)
    number_of_replicas: int = Field(default=1, ge=1)
    insert_batch_size: int = Field(default=100, ge=1)

    @staticmethod
    def _reveal(value: SecretStr | str) -> str:
        """Return the plain text of *value*, whether or not it is wrapped."""
        if isinstance(value, SecretStr):
            return value.get_secret_value()
        return value

    def to_dict(self) -> dict:
        """Return the settings as the plain dict consumed by the Hippo client.

        ``host_port``, ``username`` and ``pwd`` are the keys forwarded to
        ``HippoClient``; the remaining keys are read by the benchmark wrapper.
        """
        return {
            "host_port": [f"{self._reveal(self.ip)}:{self._reveal(self.port)}"],
            "username": self._reveal(self.username),
            "pwd": self._reveal(self.password),
            "number_of_shards": self.number_of_shards,
            "number_of_replicas": self.number_of_replicas,
            "insert_batch_size": self.insert_batch_size,
        }


class HippoIndexConfig(BaseModel, DBCaseConfig):
    """Index-build and search parameters for a Hippo vector index."""

    index: IndexType = IndexType.HNSW  # HNSW, FLAT, IVF_FLAT, IVF_SQ, IVF_PQ, ANNOY
    metric_type: MetricType | None = None
    M: int = 30  # [4,96]
    ef_construction: int = 360  # [8, 512]
    ef_search: int = 100  # [topk, 32768]
    nlist: int = 1024  # [1,65536]
    nprobe: int = 64  # [1, nlist]
    m: int = 16  # divisible by dim
    nbits: int = 8  # [1, 16]
    k_factor: int = 100  # [10, 1000]

    def parse_metric(self) -> HippoMetricType | str:
        """Translate the benchmark metric into Hippo's metric enum.

        Returns an empty string when ``metric_type`` is unset or is not one of
        COSINE / IP / L2.
        """
        translation = {
            MetricType.COSINE: HippoMetricType.COSINE,
            MetricType.IP: HippoMetricType.IP,
            MetricType.L2: HippoMetricType.L2,
        }
        return translation.get(self.metric_type, "")

    def index_param(self) -> dict:
        """Return the keyword arguments passed to ``create_index``.

        Every parameter is sent regardless of ``index``.
        NOTE(review): presumably the server ignores parameters that do not
        apply to the chosen index type — confirm against the Hippo API.
        """
        return {
            "M": self.M,
            "ef_construction": self.ef_construction,
            "ef_search": self.ef_search,
            "nlist": self.nlist,
            "nprobe": self.nprobe,
            "m": self.m,
            "nbits": self.nbits,
        }

    def search_param(self) -> dict:
        """Return the keyword arguments passed to each query call."""
        return {
            "ef_search": self.ef_search,
            "nprobe": self.nprobe,
            "k_factor": self.k_factor,
        }
class Hippo(VectorDB):
    """Benchmark wrapper around a table in the Hippo vector database.

    The constructor prepares the table and its vector index; ``init`` opens a
    table handle for the duration of a ``with`` block, and the remaining
    methods operate on that handle.
    """

    def __init__(
        self,
        dim: int,
        db_config: dict,
        db_case_config: HippoIndexConfig,
        drop_old: bool = False,
        **kwargs,
    ):
        """Initialize wrapper around the hippo vector database.

        Args:
            dim: Dimension of the vectors stored in the table.
            db_config: Plain dict of connection settings; ``host_port``,
                ``username`` and ``pwd`` are forwarded to ``HippoClient``.
            db_case_config: Index and search parameters.
            drop_old: When true, an existing benchmark table is deleted and
                recreated.  When false, an existing table is reused as is.
            **kwargs: Accepted for interface compatibility; unused.
        """
        self.name = "Hippo"
        self.db_config = db_config
        self.index_config = db_case_config

        self.database_name = "default"
        self.table_name = "vdbbench_table"
        self.index_name = "vector_index"

        self.vector_field_name = "vector"
        self.int_field_name = "label"
        self.pk_field_name = "pk"

        # Fall back to the config default so a missing key cannot turn into a
        # ``range()`` step of None during insertion.
        self.insert_batch_size = db_config.get("insert_batch_size") or 100
        self.activated = False
        # Table handle; only valid inside ``init()``.
        self.client = None

        hc = self._connect()
        table_exists = self._table_exists(hc)

        if table_exists and drop_old:
            log.info(f"delete table: {self.table_name}")
            hc.delete_table(self.table_name, database_name=self.database_name)
            hc.delete_table_in_trash(
                self.table_name, database_name=self.database_name
            )
            table_exists = False

        # Only build the table and index when there is nothing to reuse;
        # creating over an existing table would fail for ``drop_old=False``.
        if not table_exists:
            self._create_table_and_index(hc, dim)

    def _connect(self) -> HippoClient:
        """Build a new ``HippoClient`` from the connection keys in ``db_config``."""
        return HippoClient(
            **{
                k: self.db_config[k]
                for k in ("host_port", "username", "pwd")
                if k in self.db_config
            }
        )

    def _table_exists(self, hc: HippoClient) -> bool:
        """Report whether the benchmark table exists; a failed check counts as absent."""
        try:
            table_check = hc.check_table_exists(
                self.table_name, database_name=self.database_name
            )
            log.info(f"check table exsited: {table_check}")
        except ValueError as e:
            log.error("failed to check table exsited; skip", exc_info=e)
            return False
        return bool(table_check)

    def _create_table_and_index(self, hc: HippoClient, dim: int) -> None:
        """Create the benchmark table (pk, label, vector) and its vector index."""
        fields = [
            HippoField(self.pk_field_name, True, HippoType.INT64),
            HippoField(self.int_field_name, False, HippoType.INT64),
            HippoField(
                self.vector_field_name,
                False,
                HippoType.FLOAT_VECTOR,
                type_params={"dimension": dim},
            ),
        ]
        log.info(f"create table: {self.table_name}")
        hc.create_table(
            name=self.table_name,
            fields=fields,
            database_name=self.database_name,
            number_of_shards=self.db_config.get("number_of_shards"),
            number_of_replicas=self.db_config.get("number_of_replicas"),
        )

        table = hc.get_table(self.table_name, database_name=self.database_name)
        log.info("create index")
        table.create_index(
            field_name=self.vector_field_name,
            index_name=self.index_name,
            index_type=self.index_config.index,
            metric_type=self.index_config.parse_metric(),
            **self.index_config.index_param(),
        )

    def need_normalize_cosine(self) -> bool:
        """Whether this database needs the dataset normalized to support COSINE."""
        return False

    @contextmanager
    def init(self):
        """
        generate connection
        Examples:
            >>> with self.init():
            >>>     self.insert_embeddings()
            >>>     self.search_embedding()
        """
        self.client = self._connect().get_table(
            self.table_name, database_name=self.database_name
        )
        try:
            yield
        finally:
            # Drop the handle so it is not reused outside the ``with`` block.
            self.client = None

    def _activate_index(self):
        """Activate the vector index once; failures are logged and skipped."""
        if self.activated:
            return
        try:
            log.info("start activate index, please wait ...")
            self.client.activate_index(
                self.index_name, wait_for_completion=True, timeout="25h"
            )
            log.info("index is actived.")
        except Exception as e:
            # Best effort: keep going, but leave ``activated`` false so a later
            # call can try again instead of assuming the index is live.
            log.error("failed to activate index; skip", exc_info=e)
        else:
            self.activated = True

    def insert_embeddings(
        self, embeddings: Iterable[list[float]], metadata: list[int], **kwargs
    ):
        """Insert vectors in batches of ``insert_batch_size``.

        Each metadata value is written to both the primary-key column and the
        label column.  ``embeddings`` must support ``len()`` and slicing.

        Returns:
            ``(inserted_count, None)`` on success, or
            ``(inserted_count_so_far, exception)`` if a batch failed.
        """
        assert self.client is not None
        insert_count = 0
        step = self.insert_batch_size
        try:
            for batch_start_offset in range(0, len(embeddings), step):
                log.info("batch offset: %d", batch_start_offset)
                batch_end = batch_start_offset + step

                ids = list(metadata[batch_start_offset:batch_end])
                vectors = [
                    i.tolist() if isinstance(i, np.ndarray) else i
                    for i in embeddings[batch_start_offset:batch_end]
                ]

                # Column order follows the table schema: pk, label, vector.
                self.client.insert_rows([ids, list(ids), vectors])
                insert_count += len(ids)
        except Exception as e:
            log.error("hippp insert error", exc_info=e)
            return (insert_count, e)

        log.info("total insert: %d", insert_count)

        return (insert_count, None)

    def search_embedding(
        self,
        query: list[float],
        k: int = 100,
        filters: dict | None = None,
        timeout: int | None = None,
    ) -> list[int]:
        """Return the label values of the ``k`` nearest rows to ``query``.

        When ``filters`` is given, only rows whose label is ``>= filters['id']``
        are searched.  ``timeout`` is accepted but not used.
        """
        dsl = f"{self.int_field_name} >= {filters['id']}" if filters else ""
        output_fields = [self.int_field_name]
        result = self.client.query(
            self.vector_field_name,
            [query],
            output_fields,
            k,
            dsl=dsl,
            **self.index_config.search_param(),
        )

        return result[0][self.int_field_name]

    def optimize(self, **kwargs):
        """Activate the vector index and, for filtered cases, index the label column.

        A scalar index is created only when the ``filters`` keyword is truthy.
        """
        self._activate_index()

        if kwargs.get("filters"):
            log.info(f"create scalar index on field: {self.int_field_name}")
            self.client.create_scalar_index(
                field_names=[self.int_field_name],
                index_name="idx_" + self.int_field_name,
            )
            log.info("scalar index created")

    def ready_to_load(self):
        """No preparation is needed before loading; returns ``None``."""
        return
+table_name = "vdbbench_table" +# table_check = hc.check_table_exists(table_name, database_name=database_name) +# if table_check: +# hc.delete_table(table_name, database_name=database_name) +# hc.delete_table_in_trash(table_name, database_name=database_name) +vector_field_name = "vector" +int_field_name = "label" +pk_field_name = "pk" +fields = [ + HippoField(pk_field_name, True, HippoType.INT64), + HippoField(int_field_name, False, HippoType.INT64), + HippoField(vector_field_name, False, HippoType.FLOAT_VECTOR, + type_params={"dimension": dim}), +] +client = hc.create_table(name=table_name, fields=fields, + database_name=database_name, number_of_shards=1, number_of_replicas=1) + + +# get table +client = hc.get_table(table_name, database_name=database_name) + + +# create index +index_name = "vector_index" +M = 30 # [4,96] +ef_construction = 360 # [8, 512] +ef_search = 100 # [topk, 32768] +client.create_index(field_name=vector_field_name, index_name=index_name, + index_type=IndexType.HNSW, metric_type=MetricType.L2, + M=M, ef_construction=ef_construction, ef_search=ef_search) + + +# # load? 
+# index_loaded = client.load_index(index_name) + +# insert +pk_data = np.arange(n_train) +int_data = np.random.randint(0, 100, n_train) +vector_data = np.random.rand(n_train, dim) +batch_size = 100 +for offset in range(0, n_train, batch_size): + start = offset + end = offset + batch_size + print(f"insert {start}-{end}") + data = [ + pk_data[start:end].tolist(), int_data[start:end].tolist( + ), vector_data[start:end].tolist(), + ] + client.insert_rows(data) + +# need activate - like milvus load +client.activate_index(index_name, wait_for_completion=True, timeout="25h") + +# ann search +query_vectors = np.random.rand(n_test, dim) +output_fields = [pk_field_name, int_field_name] +k = 10 +dsl = f"{int_field_name} >= 90" +result = client.query(vector_field_name, query_vectors.tolist(), + output_fields, topk=k, dsl=dsl) +print(result[0]) + +result = client.query(vector_field_name, query_vectors.tolist(), + output_fields, topk=100) +print(result[0]) + +# delete table +hc.delete_table(table_name, database_name=database_name) +hc.delete_table_in_trash(table_name, database_name=database_name) + +# # delete database +# hc.delete_database(database_name) diff --git a/vectordb_bench/backend/data_source.py b/vectordb_bench/backend/data_source.py index 65926ff6b..5f944b065 100644 --- a/vectordb_bench/backend/data_source.py +++ b/vectordb_bench/backend/data_source.py @@ -132,7 +132,8 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec remote_file = pathlib.Path(self.remote_root, dataset, file) local_file = local_ds_root.joinpath(file) - if (not local_file.exists()) or (not self.validate_file(remote_file, local_file, check_etag)): + # if (not local_file.exists()) or (not self.validate_file(remote_file, local_file, check_etag)): + if (not local_file.exists()): log.info(f"local file: {local_file} not match with remote: {remote_file}; add to downloading list") downloads.append(remote_file) diff --git a/vectordb_bench/backend/runner/serial_runner.py 
b/vectordb_bench/backend/runner/serial_runner.py index aeed0ec74..e11822f9a 100644 --- a/vectordb_bench/backend/runner/serial_runner.py +++ b/vectordb_bench/backend/runner/serial_runner.py @@ -47,6 +47,7 @@ def task(self) -> int: log.debug(f"batch dataset size: {len(all_embeddings)}, {len(all_metadata)}") last_batch = self.dataset.data.size - count == len(all_metadata) + log.info(f"last_batch: {last_batch} data size: {self.dataset.data.size} count: {count}") insert_count, error = self.db.insert_embeddings( embeddings=all_embeddings, metadata=all_metadata, @@ -114,7 +115,7 @@ def _insert_all_batches(self) -> int: psutil.Process(pid).kill() raise PerformanceTimeoutError(msg) from e except Exception as e: - log.warning(f"VectorDB load dataset error: {e}") + log.error("VectorDB load dataset error: ", exc_info=e) raise e from e else: return count @@ -169,7 +170,7 @@ def __init__( self.test_data = test_data self.ground_truth = ground_truth - def search(self, args: tuple[list, pd.DataFrame]): + def search(self, args: tuple[list, pd.DataFrame]) -> tuple[float, float, float]: log.info(f"{mp.current_process().name:14} start search the entire test_data to get recall and latency") with self.db.init(): test_data, ground_truth = args @@ -213,14 +214,14 @@ def search(self, args: tuple[list, pd.DataFrame]): f"avg_latency={avg_latency}, " f"p99={p99}" ) - return (avg_recall, p99) + return (avg_recall, p99, avg_latency) - def _run_in_subprocess(self) -> tuple[float, float]: + def _run_in_subprocess(self) -> tuple[float, float, float]: with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor: future = executor.submit(self.search, (self.test_data, self.ground_truth)) result = future.result() return result - def run(self) -> tuple[float, float]: + def run(self) -> tuple[float, float, float]: return self._run_in_subprocess() diff --git a/vectordb_bench/backend/task_runner.py b/vectordb_bench/backend/task_runner.py index 80c5ac1df..b78a6bab3 100644 --- 
a/vectordb_bench/backend/task_runner.py +++ b/vectordb_bench/backend/task_runner.py @@ -58,7 +58,7 @@ def __eq__(self, obj): self.config.db == obj.config.db and \ self.config.db_case_config == obj.config.db_case_config and \ self.ca.dataset == obj.ca.dataset - return False + return False def display(self) -> dict: c_dict = self.ca.dict(include={'label':True, 'filters': True,'dataset':{'data': {'name': True, 'size': True, 'dim': True, 'metric_type': True, 'label': True}} }) @@ -140,7 +140,7 @@ def _run_perf_case(self, drop_old: bool = True) -> Metric: ) self._init_search_runner() - m.recall, m.serial_latency_p99 = self._serial_search() + m.recall, m.serial_latency_p99, m.serial_latency_avg = self._serial_search() m.qps = self._conc_search() except Exception as e: log.warning(f"Failed to run performance case, reason = {e}") @@ -161,7 +161,7 @@ def _load_train_data(self): finally: runner = None - def _serial_search(self) -> tuple[float, float]: + def _serial_search(self) -> tuple[float, float, float]: """Performance serial tests, search the entire test data once, calculate the recall, serial_latency_p99 @@ -193,7 +193,7 @@ def _conc_search(self): @utils.time_it def _task(self) -> None: with self.db.init(): - self.db.optimize() + self.db.optimize(filters=self.ca.filters) def _optimize(self) -> float: with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor: diff --git a/vectordb_bench/frontend/components/run_test/dbSelector.py b/vectordb_bench/frontend/components/run_test/dbSelector.py index 61db843f3..40c5ee5f9 100644 --- a/vectordb_bench/frontend/components/run_test/dbSelector.py +++ b/vectordb_bench/frontend/components/run_test/dbSelector.py @@ -30,7 +30,7 @@ def dbSelector(st): for i, db in enumerate(DB_LIST): column = dbContainerColumns[i % DB_SELECTOR_COLUMNS] dbIsActived[db] = column.checkbox(db.name) - column.image(DB_TO_ICON.get(db, "")) + column.image(DB_TO_ICON.get(db, ""), width=100) activedDbList = [db for db in DB_LIST if dbIsActived[db]] 
return activedDbList diff --git a/vectordb_bench/frontend/const/dbCaseConfigs.py b/vectordb_bench/frontend/const/dbCaseConfigs.py index fad5f362d..ad6a3b73f 100644 --- a/vectordb_bench/frontend/const/dbCaseConfigs.py +++ b/vectordb_bench/frontend/const/dbCaseConfigs.py @@ -1,10 +1,12 @@ -from enum import IntEnum import typing +from enum import IntEnum + from pydantic import BaseModel +from transwarp_hippo_api.hippo_type import IndexType as HippoIndexType + from vectordb_bench.backend.cases import CaseLabel, CaseType from vectordb_bench.backend.clients import DB from vectordb_bench.backend.clients.api import IndexType - from vectordb_bench.models import CaseConfigParamType MAX_STREAMLIT_INT = (1 << 53) - 1 @@ -419,6 +421,156 @@ class CaseConfigInput(BaseModel): }, ) +CaseConfigParamInput_IndexType_Hippo = CaseConfigInput( + label=CaseConfigParamType.IndexType, + inputType=InputType.Option, + inputConfig={ + "options": [ + HippoIndexType.HNSW.value, + HippoIndexType.FLAT.value, + HippoIndexType.IVF_FLAT.value, + HippoIndexType.IVF_SQ.value, + HippoIndexType.IVF_PQ.value, + HippoIndexType.ANNOY.value, + ], + }, +) + +CaseConfigParamInput_M_Hippo = CaseConfigInput( + label=CaseConfigParamType.M, + inputType=InputType.Number, + inputConfig={ + "min": 4, + "max": 64, + "value": 30, + }, + isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None) + == HippoIndexType.HNSW.value, +) + +CaseConfigParamInput_EFConstruction_Hippo = CaseConfigInput( + label=CaseConfigParamType.ef_construction, + inputType=InputType.Number, + inputConfig={ + "min": 8, + "max": 512, + "value": 360, + }, + isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None) + == HippoIndexType.HNSW.value, +) + +CaseConfigParamInput_EFSearch_Hippo = CaseConfigInput( + label=CaseConfigParamType.ef_search, + inputType=InputType.Number, + inputConfig={ + "min": 100, + "max": MAX_STREAMLIT_INT, + "value": 100, + }, + isDisplayed=lambda config: 
config.get(CaseConfigParamType.IndexType, None) + == HippoIndexType.HNSW.value, +) + +CaseConfigParamInput_Nlist_Hippo = CaseConfigInput( + label=CaseConfigParamType.Nlist, + inputType=InputType.Number, + inputConfig={ + "min": 1, + "max": 65536, + "value": 1024, + }, + isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None) + in [ + HippoIndexType.IVF_FLAT.value, + HippoIndexType.IVF_SQ.value, + HippoIndexType.IVF_PQ.value, + # TODO: add ivf_pq_fs + ], +) + +CaseConfigParamInput_Nprobe_Hippo = CaseConfigInput( + label=CaseConfigParamType.Nprobe, + inputType=InputType.Number, + inputConfig={ + "min": 1, + "max": 65536, + "value": 64, + }, + isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None) + in [ + HippoIndexType.IVF_FLAT.value, + HippoIndexType.IVF_SQ.value, + HippoIndexType.IVF_PQ.value, + # TODO: add ivf_pq_fs + ], +) + +CaseConfigParamInput_m_Hippo = CaseConfigInput( + label=CaseConfigParamType.m, + inputType=InputType.Number, + inputConfig={ + "min": 1, + "max": 1024, + "value": 16, + }, + isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None) + in [ + HippoIndexType.IVF_PQ.value, + # TODO: add ivf_pq_fs + ], +) + +CaseConfigParamInput_nbits_Hippo = CaseConfigInput( + label=CaseConfigParamType.nbits, + inputType=InputType.Number, + inputConfig={ + "min": 1, + "max": 16, + "value": 8, + }, + isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None) + in [ + HippoIndexType.IVF_PQ.value, + ], +) + +CaseConfigParamInput_k_factor_Hippo = CaseConfigInput( + label=CaseConfigParamType.k_factor, + inputType=InputType.Number, + inputConfig={ + "min": 1, + "max": 1000, + "value": 100, + }, + isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None) + in [ + HippoIndexType.IVF_PQ.value, + HippoIndexType.IVF_SQ.value, + # TODO: add ivf_pq_fs + ], +) + +CaseConfigParamInput_index_slow_refine_Hippo = CaseConfigInput( + label=CaseConfigParamType.index_slow_refine, + 
inputType=InputType.Option, + inputConfig={"options": [False, True]}, + isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None) + in [ + HippoIndexType.IVF_PQ.value, + HippoIndexType.IVF_SQ.value, + # TODO: add ivf_pq_fs + ], +) + +CaseConfigParamInput_sq_type_Hippo = CaseConfigInput( + label=CaseConfigParamType.sq_type, + inputType=InputType.Text, + inputConfig={"value": ""}, + isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None) + == HippoIndexType.IVF_SQ.value, +) + MilvusLoadConfig = [ CaseConfigParamInput_IndexType, CaseConfigParamInput_M, @@ -496,6 +648,29 @@ class CaseConfigInput(BaseModel): CaseConfigParamInput_ZillizLevel, ] +HippoLoadConfig = [ + CaseConfigParamInput_IndexType_Hippo, + CaseConfigParamInput_M_Hippo, + CaseConfigParamInput_EFConstruction_Hippo, + CaseConfigParamInput_EFSearch_Hippo, + CaseConfigParamInput_Nlist_Hippo, + CaseConfigParamInput_m_Hippo, + CaseConfigParamInput_nbits_Hippo, +] +HippoPerformanceConfig = [ + CaseConfigParamInput_IndexType_Hippo, + CaseConfigParamInput_M_Hippo, + CaseConfigParamInput_EFConstruction_Hippo, + CaseConfigParamInput_EFSearch_Hippo, + CaseConfigParamInput_Nlist_Hippo, + CaseConfigParamInput_Nprobe_Hippo, + CaseConfigParamInput_m_Hippo, + CaseConfigParamInput_nbits_Hippo, + CaseConfigParamInput_k_factor_Hippo, + CaseConfigParamInput_index_slow_refine_Hippo, + CaseConfigParamInput_sq_type_Hippo, +] + CASE_CONFIG_MAP = { DB.Milvus: { CaseLabel.Load: MilvusLoadConfig, @@ -520,4 +695,8 @@ class CaseConfigInput(BaseModel): CaseLabel.Load: PgVectoRSLoadingConfig, CaseLabel.Performance: PgVectoRSPerformanceConfig, }, + DB.Hippo: { + CaseLabel.Load: HippoLoadConfig, + CaseLabel.Performance: HippoPerformanceConfig, + }, } diff --git a/vectordb_bench/frontend/const/styles.py b/vectordb_bench/frontend/const/styles.py index 52d1017a9..ad3e4d9c0 100644 --- a/vectordb_bench/frontend/const/styles.py +++ b/vectordb_bench/frontend/const/styles.py @@ -46,6 +46,7 @@ def 
getPatternShape(i): DB.PgVectoRS: "https://assets.zilliz.com/PG_Vector_d464f2ef5f.png", DB.Redis: "https://assets.zilliz.com/Redis_Cloud_74b8bfef39.png", DB.Chroma: "https://assets.zilliz.com/chroma_ceb3f06ed7.png", + DB.Hippo: "https://assets.zilliz.com/hippo_3ce85bc90f.png", } # RedisCloud color: #0D6EFD @@ -59,4 +60,5 @@ def getPatternShape(i): DB.WeaviateCloud.value: "#20C997", DB.PgVector.value: "#4C779A", DB.Redis.value: "#0D6EFD", + DB.Hippo.value: "#333", } diff --git a/vectordb_bench/metric.py b/vectordb_bench/metric.py index a2b6d6ff0..be90f3adb 100644 --- a/vectordb_bench/metric.py +++ b/vectordb_bench/metric.py @@ -18,12 +18,14 @@ class Metric: load_duration: float = 0.0 # duration to load all dataset into DB qps: float = 0.0 serial_latency_p99: float = 0.0 + serial_latency_avg: float = 0.0 recall: float = 0.0 QURIES_PER_DOLLAR_METRIC = "QP$ (Quries per Dollar)" LOAD_DURATION_METRIC = "load_duration" SERIAL_LATENCY_P99_METRIC = "serial_latency_p99" +SERIAL_LATENCY_AVG_METRIC = "serial_latency_avg" MAX_LOAD_COUNT_METRIC = "max_load_count" QPS_METRIC = "qps" RECALL_METRIC = "recall" @@ -31,6 +33,7 @@ class Metric: metricUnitMap = { LOAD_DURATION_METRIC: "s", SERIAL_LATENCY_P99_METRIC: "ms", + SERIAL_LATENCY_AVG_METRIC: "ms", MAX_LOAD_COUNT_METRIC: "K", QURIES_PER_DOLLAR_METRIC: "K", } @@ -38,6 +41,7 @@ class Metric: lowerIsBetterMetricList = [ LOAD_DURATION_METRIC, SERIAL_LATENCY_P99_METRIC, + SERIAL_LATENCY_AVG_METRIC, ] metricOrder = [ @@ -45,6 +49,7 @@ class Metric: RECALL_METRIC, LOAD_DURATION_METRIC, SERIAL_LATENCY_P99_METRIC, + SERIAL_LATENCY_AVG_METRIC, MAX_LOAD_COUNT_METRIC, ] diff --git a/vectordb_bench/models.py b/vectordb_bench/models.py index 3c2a5b9aa..da7da9ca6 100644 --- a/vectordb_bench/models.py +++ b/vectordb_bench/models.py @@ -1,23 +1,22 @@ import logging import pathlib from datetime import date -from typing import Self from enum import Enum +from typing import Self import ujson +from . 
import config +from .backend.cases import CaseType from .backend.clients import ( DB, - DBConfig, DBCaseConfig, + DBConfig, IndexType, ) -from .backend.cases import CaseType from .base import BaseModel -from . import config from .metric import Metric - log = logging.getLogger(__name__) @@ -60,6 +59,11 @@ class CaseConfigParamType(Enum): cache_dataset_on_device = "cache_dataset_on_device" refine_ratio = "refine_ratio" level = "level" + ef_construction = "ef_construction" + ef_search = "ef_search" + k_factor = "k_factor" + index_slow_refine = "index_slow_refine" + sq_type = "sq_type" class CustomizedCase(BaseModel): From 8bb1b529078952b4c00c7ab80f9b418b395e281c Mon Sep 17 00:00:00 2001 From: "bingtao.yin" Date: Fri, 1 Mar 2024 04:52:12 +0000 Subject: [PATCH 2/3] remove some files --- .devcontainer/devcontainer.json | 50 ------------------- .devcontainer/postCreateCommand.sh | 34 ------------- .gitignore | 8 +--- ci-transwarp/Dockerfile | 33 ------------- ci-transwarp/Note.md | 77 ------------------------------ ci-transwarp/pip.conf | 5 -- 6 files changed, 1 insertion(+), 206 deletions(-) delete mode 100644 .devcontainer/devcontainer.json delete mode 100644 .devcontainer/postCreateCommand.sh delete mode 100644 ci-transwarp/Dockerfile delete mode 100644 ci-transwarp/Note.md delete mode 100644 ci-transwarp/pip.conf diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json deleted file mode 100644 index b3439a14e..000000000 --- a/.devcontainer/devcontainer.json +++ /dev/null @@ -1,50 +0,0 @@ -// For format details, see https://aka.ms/devcontainer.json. 
For config options, see the -// README at: https://github.com/devcontainers/templates/tree/main/src/cpp -{ - "name": "vectordb bench dev", - "image": "172.16.1.99/hippo/vectordb_bench/builder:latest", - "runArgs": [ - "--privileged", - "--cap-add=SYS_PTRACE", - "--security-opt", - "seccomp=unconfined" - ], - "mounts": [ - "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind" - ], - "workspaceMount": "source=${localWorkspaceFolder},target=/opt/transwarp/vectordb_bench,type=bind,consistency=cached", - "workspaceFolder": "/opt/transwarp/vectordb_bench", - "customizations": { - "vscode": { - // Set *default* container specific settings.json values on container create. - "settings": { - "terminal.integrated.shell.linux": "/bin/bash", - "C_Cpp.default.cppStandard": "c++20", - "C_Cpp.default.cStandard": "c17", - "C_Cpp.default.browse.databaseFilename": "${workspaceFolder}/.vscode/.browse.c_cpp.db", - }, - // Add the IDs of extensions you want installed when the container is created. - "extensions": [ - "foxundermoon.shell-format", - "redhat.vscode-yaml", - "ms-azuretools.vscode-docker", - "EditorConfig.EditorConfig", - "codezombiech.gitignore", - "yzhang.markdown-all-in-one", - "SonarSource.sonarlint-vscode", - "GitHub.copilot", - "ms-python.python", - "ms-python.debugpy", - "VisualStudioExptTeam.vscodeintellicode", - "donjayamanne.python-environment-manager", - "charliermarsh.ruff" - ] - } - }, - // Use 'forwardPorts' to make a list of ports inside the container available locally. - // "forwardPorts": [], - // Use 'postCreateCommand' to run commands after the container is created. - "postCreateCommand": "bash ./.devcontainer/postCreateCommand.sh" - // Comment out connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. 
- // "remoteUser": "vscode" -} \ No newline at end of file diff --git a/.devcontainer/postCreateCommand.sh b/.devcontainer/postCreateCommand.sh deleted file mode 100644 index 3f8f77fcc..000000000 --- a/.devcontainer/postCreateCommand.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -function create_user { - USERNAME=dev - USER_UID=1000 - USER_GID=$USER_UID - - # Create the user - groupadd --gid $USER_GID $USERNAME - useradd --uid $USER_UID --gid $USER_GID -m $USERNAME - - # [Optional] Add sudo support. Omit if you don't need to install software after connecting. - apt install -y sudo - echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME - chmod 0440 /etc/sudoers.d/$USERNAME -} - -function fix_git { - # touch ~/.gitconfig - - git config --global --add safe.directory /opt/transwarp/vectordb_bench - - git config --global --unset http.https://github.com.proxy - git config --global --unset https.https://github.com.proxy - -} - -function main { - # create_user - - fix_git -} - -main $@ \ No newline at end of file diff --git a/.gitignore b/.gitignore index 0a316c319..8d28d7eeb 100644 --- a/.gitignore +++ b/.gitignore @@ -10,9 +10,6 @@ build/ venv/ .idea/ -# result files -vectordb_bench/results/** - # vscode files .vscode/* !.vscode/tasks.json @@ -26,7 +23,4 @@ vectordb_bench/results/** .history/ # Built Visual Studio Code Extensions -*.vsix - -# ruff files -.ruff_cache \ No newline at end of file +*.vsix \ No newline at end of file diff --git a/ci-transwarp/Dockerfile b/ci-transwarp/Dockerfile deleted file mode 100644 index 6c21a5219..000000000 --- a/ci-transwarp/Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -FROM ubuntu:22.04 - -ARG build_dir=/opt/transwarp/vectordb_bench - -RUN \ - # basics - PKGS="software-properties-common vim sudo locales git" && \ - apt-get update && \ - apt-get install -y ${PKGS} && \ - locale-gen en_US.UTF-8 && \ - ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/timezone && \ - ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \ 
- # python - add-apt-repository -y ppa:deadsnakes/ppa && \ - apt-get update && \ - apt-get install -y python3-pip python3.11 python3.11-dev && \ - apt-get clean && apt-get autoclean - -ENV LC_ALL="en_US.UTF-8" -ENV LANG="en_US.UTF-8" - -COPY ci-transwarp/pip.conf /root/.config/pip/pip.conf -COPY . ${build_dir} - -RUN \ - update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \ - python -m pip install --upgrade pip && \ - python -m pip install hippo-api==1.2.0rc1 && \ - cd ${build_dir} && python -m pip install .[test] && cd ${build_dir} && rm -rf ./* - -VOLUME [ "${build_dir}" ] - -WORKDIR ${build_dir} diff --git a/ci-transwarp/Note.md b/ci-transwarp/Note.md deleted file mode 100644 index 838d1f783..000000000 --- a/ci-transwarp/Note.md +++ /dev/null @@ -1,77 +0,0 @@ -CI - ---- - -### 准备数据集 - -``` -aws s3 ls s3://assets.zilliz.com/benchmark/ --region us-west-2 --recursive --no-sign-request - -aws s3 cp s3://assets.zilliz.com/benchmark/cohere_medium_1m cohere_medium_1m --region us-west-2 --recursive --no-sign-request -``` - -### 构建 - -```shell -docker build \ - --network=host \ - -f ci-transwarp/Dockerfile \ - -t 172.16.1.99/hippo/vectordb_bench/builder \ - . -``` - -### 运行 - -```shell -git clone -b dev "http://gitlab+deploy-token-54:AJJ9dcXoYsHXKaHLdb2A@172.16.1.41/distributed-storage/vectordbbench.git" - -# docker run这个在上一个clone出来的目录下跑 -docker run \ - --network=host \ - -itd \ - -v $(pwd):/opt/transwarp/vectordb_bench \ - -v XXXX:/tmp/vectordb_bench/dataset \ - 172.16.1.99/hippo/vectordb_bench/builder bash -``` - -XXXX这个目录是数据集的目录,目录结构大概如下(参考tw-node45节点/mnt/disk1/hippo/dataset/vectordb_bench, tar.gz文件忽略): - -``` -[root@tw-node45 vectordb_bench]# tree -. 
-├── cohere -│   └── cohere_medium_1m -│   ├── neighbors_head_1p.parquet -│   ├── neighbors.parquet -│   ├── neighbors_tail_1p.parquet -│   ├── shuffle_train.parquet -│   ├── test.parquet -│   └── train.parquet -├── cohere_medium_1m.tar.gz -└── openai - ├── openai_medium_500k - │   ├── neighbors_head_1p.parquet - │   ├── neighbors.parquet - │   ├── neighbors_tail_1p.parquet - │   ├── shuffle_train.parquet - │   ├── test.parquet - │   └── train.parquet - ├── openai_small_50k - │   ├── neighbors_head_1p.parquet - │   ├── neighbors.parquet - │   ├── neighbors_tail_1p.parquet - │   ├── shuffle_train.parquet - │   ├── test.parquet - │   └── train.parquet - └── openai_small_50k.tar.gz - -``` - - -容器里执行: - -```shell -cd /opt/transwarp/vectordb_bench -python -m pip install . -init_bench -``` \ No newline at end of file diff --git a/ci-transwarp/pip.conf b/ci-transwarp/pip.conf deleted file mode 100644 index ddc2a8931..000000000 --- a/ci-transwarp/pip.conf +++ /dev/null @@ -1,5 +0,0 @@ -[global] -index-url = https://mirrors.aliyun.com/pypi/simple - -[install] -trusted-host = mirrors.aliyun.com From b648245040b688765f2459d8ef427f476e6c658a Mon Sep 17 00:00:00 2001 From: "bingtao.yin" Date: Fri, 1 Mar 2024 05:01:46 +0000 Subject: [PATCH 3/3] enable validate file --- vectordb_bench/backend/data_source.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vectordb_bench/backend/data_source.py b/vectordb_bench/backend/data_source.py index 5f944b065..65926ff6b 100644 --- a/vectordb_bench/backend/data_source.py +++ b/vectordb_bench/backend/data_source.py @@ -132,8 +132,7 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec remote_file = pathlib.Path(self.remote_root, dataset, file) local_file = local_ds_root.joinpath(file) - # if (not local_file.exists()) or (not self.validate_file(remote_file, local_file, check_etag)): - if (not local_file.exists()): + if (not local_file.exists()) or (not self.validate_file(remote_file, local_file, 
check_etag)): log.info(f"local file: {local_file} not match with remote: {remote_file}; add to downloading list") downloads.append(remote_file)