Skip to content

Commit

Permalink
enhance: Support for windows
Browse files Browse the repository at this point in the history
Add github actions to make sure about it

See also: #285

Signed-off-by: yangxuan <[email protected]>
  • Loading branch information
XuanYang-cn committed Mar 13, 2024
1 parent c6af695 commit 08b5313
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 18 deletions.
36 changes: 36 additions & 0 deletions .github/workflows/pull_request.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# CI workflow: runs the `unittest` make target for every pull request that
# targets main, across each Python version / operating system combination.
name: Test on pull request

on:
  pull_request:
    branches:
      - main

jobs:
  build:
    name: Run Python Tests
    strategy:
      matrix:
        # Quoted so YAML keeps the versions as strings; an unquoted 3.10
        # would be parsed as the number 3.1.
        python-version: ["3.11", "3.12"]
        os: [ubuntu-latest, windows-latest]
    runs-on: ${{ matrix.os }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      # The checkout above is shallow; pull the full history and all tags.
      # NOTE(review): presumably needed so the package version can be derived
      # from git tags during install - confirm.
      - name: Fetch tags
        run: |
          git fetch --prune --unshallow --tags
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e ".[test]"
      - name: Test with pytest
        run: |
          make unittest
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# `unittest` names a command, not a file to build; declare it phony so a file
# or directory called `unittest` can never make the target look up to date.
.PHONY: unittest

# Run the small dataset-download test with the current directory on PYTHONPATH.
unittest:
	PYTHONPATH=`pwd` python3 -m pytest tests/test_dataset.py::TestDataSet::test_download_small -svv
39 changes: 26 additions & 13 deletions tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def test_iter_cohere(self):
dur_iter = time.time() - before
log.warning(f"iter through cohere_10m cost={dur_iter/60}min")

# pytest -sv tests/test_dataset.py::TestDataSet::test_iter_laion
# pytest -sv tests/test_dataset.py::TestDataSet::test_iter_laion
def test_iter_laion(self):
laion_100m = Dataset.LAION.manager(100_000_000)
from vectordb_bench.backend.data_source import DatasetSource
Expand All @@ -50,17 +50,30 @@ def test_iter_laion(self):
dur_iter = time.time() - before
log.warning(f"iter through laion_100m cost={dur_iter/60}min")

# https://github.com/zilliztech/VectorDBBench/issues/285
# TODO: ok
def test_iter_openai(self):

openai_500k = Dataset.OPENAI.manager(500_000)
openai_500k.prepare(source=DatasetSource.AliyunOSS, check=False)
def test_download_small(self):
openai_50k = Dataset.OPENAI.manager(50_000)
files = [
"test.parquet",
"neighbors.parquet",
"neighbors_head_1p.parquet",
"neighbors_tail_1p.parquet",
]

import time
before = time.time()
for i in openai_500k:
log.debug(i.head(1))
file_path = openai_50k.data_dir.joinpath("test.parquet")
import os

DatasetSource.S3.reader().read(
openai_50k.data.dir_name.lower(),
files=files,
local_ds_root=openai_50k.data_dir,
check_etag=False,
)

os.remove(file_path)
DatasetSource.AliyunOSS.reader().read(
openai_50k.data.dir_name.lower(),
files=files,
local_ds_root=openai_50k.data_dir,
check_etag=False,
)

dur_iter = time.time() - before
log.warning(f"iter through openai 500K cost={dur_iter/60}min, source=AliyunOSS")
10 changes: 5 additions & 5 deletions vectordb_bench/backend/data_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,11 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec
if not local_ds_root.exists():
log.info(f"local dataset root path not exist, creating it: {local_ds_root}")
local_ds_root.mkdir(parents=True)
downloads = [(pathlib.Path("benchmark", dataset, f), local_ds_root.joinpath(f)) for f in files]
downloads = [(pathlib.PosixPath("benchmark", dataset, f), local_ds_root.joinpath(f)) for f in files]

else:
for file in files:
remote_file = pathlib.Path("benchmark", dataset, file)
remote_file = pathlib.PosixPath("benchmark", dataset, file)
local_file = local_ds_root.joinpath(file)

# Don't check etags for Dataset from Aliyun OSS
Expand Down Expand Up @@ -125,11 +125,11 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec
if not local_ds_root.exists():
log.info(f"local dataset root path not exist, creating it: {local_ds_root}")
local_ds_root.mkdir(parents=True)
downloads = [pathlib.Path(self.remote_root, dataset, f) for f in files]
downloads = [pathlib.PosixPath(self.remote_root, dataset, f) for f in files]

else:
for file in files:
remote_file = pathlib.Path(self.remote_root, dataset, file)
remote_file = pathlib.PosixPath(self.remote_root, dataset, file)
local_file = local_ds_root.joinpath(file)

if (not local_file.exists()) or (not self.validate_file(remote_file, local_file, check_etag)):
Expand All @@ -147,7 +147,7 @@ def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path, chec
log.info(f"Succeed to download all files, downloaded file count = {len(downloads)}")


def validate_file(self, remote: pathlib.Path, local: pathlib.Path, check_etag: bool) -> bool:
def validate_file(self, remote: pathlib.PosixPath, local: pathlib.Path, check_etag: bool) -> bool:
# info() uses ls() inside, maybe we only need to ls once
info = self.fs.info(remote)

Expand Down

0 comments on commit 08b5313

Please sign in to comment.