Custom dataset testing #9

Open. Wants to merge 21 commits into base: diskann-testing.

Commits (21)
3905f46
Add separate configs for index and search for multiple datasets.
Sheharyar570 Oct 14, 2024
4eb23bb
Update run script to handle multiple configs in a directory.
Sheharyar570 Oct 14, 2024
f662dd1
Resolved comments
Sheharyar570 Oct 15, 2024
10e1d74
Updated script to generate only the dataset with the specified file_count.
Sheharyar570 Oct 15, 2024
a5cb055
Added config files for large datasets
Sheharyar570 Oct 15, 2024
b605c83
Updated scripts and configs to create dataset at runtime.
Sheharyar570 Oct 15, 2024
12a439a
Updated dataset directory paths
Sheharyar570 Oct 15, 2024
d990ac5
Added run configs for custom datasets of up to 5 million rows
Sheharyar570 Oct 15, 2024
b6dd4d5
Increased run count to 3 for search configs
Sheharyar570 Oct 15, 2024
17b34e0
Updated db labels in configs.
Sheharyar570 Oct 15, 2024
0ab30ce
Updated db-label in config.
Sheharyar570 Oct 15, 2024
2603723
Dividing configs based on dataset size for building indexes in parallel
Sheharyar570 Oct 15, 2024
22e2fd9
Fixed create_dataset_args directory paths
Sheharyar570 Oct 15, 2024
34e6b45
Copy neighbors.parquet into created_dataset directory
Sheharyar570 Oct 15, 2024
130425e
Divided run configs into 2 folders to run in parallel on 2 instances.
Sheharyar570 Oct 16, 2024
43c18e6
Updated instance type in all configs.
Sheharyar570 Oct 16, 2024
2efaf44
Added prewarm query result to logs.
Sheharyar570 Oct 17, 2024
2fda975
Merge branch 'diskann-testing' of https://github.com/EmumbaOrg/Vector…
Sheharyar570 Oct 17, 2024
7a695e5
Updated db label in all configs
Sheharyar570 Oct 17, 2024
aaff72d
Updated custom dataset configs
Sheharyar570 Nov 4, 2024
c675feb
set shuffled_data to false
Sheharyar570 Nov 4, 2024
74 changes: 54 additions & 20 deletions create_dataset_subsets.py
@@ -1,36 +1,63 @@
 import os
 import shutil
 import argparse
+import logging
 
+# Set up logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
 def get_file_name(file_name: str, file_prefix: str, files_count: int) -> str:
     if files_count == 1:
         return file_prefix + ".parquet"
     file_name = file_name.split("of-")[0]
     return file_name + "of-" + str(files_count).zfill(2) + ".parquet"
 
-def create_subsets(base_dir, save_dir_path, subset_prefix, file_prefix, step_size):
-    files = sorted([f for f in os.listdir(base_dir) if f.startswith(file_prefix)])
+def create_dataset(base_dir, save_dir_path, subset_prefix, file_prefix, file_count, row_count=500_000):
+    logger.info(f"Starting dataset creation with {file_count} files.")
+
+    # Sort the files and pick only the first 'file_count' files
+    files = sorted([f for f in os.listdir(base_dir) if f.startswith(file_prefix)])[:file_count]
     num_files = len(files)
 
+    if num_files == 0:
+        logger.warning("No files found with the specified prefix.")
+        return
+
+    logger.info(f"Found {num_files} files. Creating dataset...")
+
+    # Create the directory for the dataset
+    subset_dir = os.path.join(save_dir_path, f"{subset_prefix}_{file_count * row_count // 1000}k")
+    os.makedirs(subset_dir, exist_ok=True)
+    logger.info(f"Created directory for the dataset: {subset_dir}")
+
+    # Copy the first 'file_count' files into the subset directory
+    for file in files:
+        src_file = os.path.join(base_dir, file)
+        dst_file = os.path.join(subset_dir, get_file_name(file, file_prefix, file_count))
+        shutil.copy(src_file, dst_file)
+        logger.info(f"Copied {file} to {dst_file}")
 
-    for i in range(1, num_files + 1):
-        subset_dir = os.path.join(save_dir_path, f"{subset_prefix}_{i * step_size // 1000}k")
-        os.makedirs(subset_dir, exist_ok=True)
-
-        for j in range(i):
-            src_file = os.path.join(base_dir, files[j])
-            dst_file = os.path.join(subset_dir, get_file_name(files[j], file_prefix, i))
-            shutil.copy(src_file, dst_file)
-        src_test_file = os.path.join(base_dir, "test.parquet")
-        dst_test_file = os.path.join(subset_dir, "test.parquet")
-        shutil.copy(src_test_file, dst_test_file)
+    # Also copy the test.parquet file
+    src_test_file = os.path.join(base_dir, "test.parquet")
+    dst_test_file = os.path.join(subset_dir, "test.parquet")
+    shutil.copy(src_test_file, dst_test_file)
+    logger.info(f"Copied test.parquet to {subset_dir}")
+
+    src_test_file = os.path.join(base_dir, "neighbors.parquet")
+    dst_test_file = os.path.join(subset_dir, "neighbors.parquet")
+    shutil.copy(src_test_file, dst_test_file)
+    logger.info(f"Copied neighbors.parquet to {subset_dir}")
+
+    logger.info(f"Dataset creation completed. {file_count} files have been copied to {subset_dir}.")
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Create subsets of Parquet files using Dask.")
-    parser.add_argument("--directory", type=str, help="Path to the directory containing Parquet files.")
-    parser.add_argument("--save-dir-path", type=str, help="Directory path where data will be saved")
-    parser.add_argument("--dataset-name-prefix", type=str, help="Name prefix of the folder where each subset will be saved.")
+    parser = argparse.ArgumentParser(description="Create a dataset with a specified number of Parquet files.")
+    parser.add_argument("--directory", type=str, required=True, help="Path to the directory containing Parquet files.")
+    parser.add_argument("--save-dir-path", type=str, required=True, help="Directory path where the dataset will be saved.")
+    parser.add_argument("--dataset-name-prefix", type=str, help="Name prefix for the dataset folder.")
+    parser.add_argument("--is-shuffled", type=bool, help="Whether the files are shuffled or not.")
+    parser.add_argument("--file-count", type=int, required=True, help="Number of Parquet files to include in the dataset.")
     args = parser.parse_args()
 
     file_prefix = (
@@ -48,7 +75,14 @@ def create_subsets(base_dir, save_dir_path, subset_prefix, file_prefix, step_size):
         if args.save_dir_path
         else args.directory
     )
-    step_size = 500_000  # 500k
-
-    create_subsets(args.directory, save_dir_path, subset_prefix, file_prefix, step_size)
-    print(f'Finished creating subsets of Parquet files in {args.directory}.')
+
+    if os.path.exists(save_dir_path) and os.listdir(save_dir_path):
+        shutil.rmtree(save_dir_path)
+        logger.info(f"Deleted existing directory: {save_dir_path}")
+
+    # Log the input parameters
+    logger.info(f"Parameters received: directory={args.directory}, save_dir_path={args.save_dir_path}, file_count={args.file_count}, dataset_name_prefix={subset_prefix}, is_shuffled={args.is_shuffled}")
+
+    # Create the dataset with the specified file_count
+    create_dataset(args.directory, save_dir_path, subset_prefix, file_prefix, args.file_count)
+    logger.info(f'Finished creating a dataset with {args.file_count} Parquet files.')
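
For reference, a quick trace of the renaming rule that get_file_name implements, assuming the source shards follow a train-XX-of-YY.parquet naming pattern (the shard names below are hypothetical; the function itself depends only on the "of-" separator): each copied shard's "of-NN" suffix is rewritten to describe the new dataset's file count rather than the source corpus.

# Hypothetical shard names, shown only to illustrate the renaming rule.
get_file_name("train-00-of-10.parquet", "train", 1)  # -> "train.parquet"
get_file_name("train-00-of-10.parquet", "train", 4)  # -> "train-00-of-04.parquet"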
@@ -0,0 +1,43 @@
{
  "database": {
    "host": "localhost1",
    "username": "postgres",
    "password": "postgres",
    "db_name": "ann-1000k",
    "instance_type": "Standard_D8ds_v5",
    "provider": "azure",
    "enable_seqscan": "on"
  },
  "cases": [
    {
      "db-label": "memory-comparison-1000k",
      "drop_old": true,
      "load": true,
      "search-serial": false,
      "search-concurrent": false,
      "case-type": "PerformanceCustomDataset",
      "maintenance-work-mem": "16GB",
      "max-parallel-workers": 7,
      "ef-search": [40],
      "ef-construction": 128,
      "m": 32,
      "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
      "concurrency-duration": 30,
      "k": 10,
      "custom-case-name": "hnsw-1536D-1m",
      "custom-dataset-name": "custom-openai",
      "custom-dataset-dir": "openai_1000k",
      "custom-dataset-size": 1000000,
      "custom-dataset-dim": 1536,
      "custom-dataset-file-count": 2,
      "custom-dataset-use-shuffled": false,
      "create-dataset-args": {
        "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
        "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
        "is-shuffled": false
      },
      "run_count": 1
    }
  ]
}
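
This PR does not show how the benchmark runner consumes create-dataset-args, but the arithmetic is visible in the script above: custom-dataset-file-count selects the number of 500,000-row shards, so the 2 files here produce the 1,000,000-row openai_1000k directory named by custom-dataset-dir (likewise 4 files for the 2000k case, 7 for 3500k, and so on). A minimal sketch of that wiring, assuming the runner simply forwards these fields to create_dataset_subsets.py; the config file name and the "openai" prefix are hypothetical:

import json
import subprocess

# Hypothetical config file name; the actual file names are not shown in this PR.
with open("run-config-1000k.json") as f:
    case = json.load(f)["cases"][0]

dataset_args = case["create-dataset-args"]
cmd = [
    "python", "create_dataset_subsets.py",
    "--directory", dataset_args["directory"],
    "--save-dir-path", dataset_args["save-dir-path"],
    # Hypothetical prefix: "openai" would yield the openai_1000k directory
    # name that custom-dataset-dir expects.
    "--dataset-name-prefix", "openai",
    "--file-count", str(case["custom-dataset-file-count"]),
]
# --is-shuffled is omitted on purpose: the script declares it with type=bool,
# and argparse turns any non-empty string (even "False") into True.
subprocess.run(cmd, check=True)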

@@ -0,0 +1,43 @@
{
  "database": {
    "host": "localhost1",
    "username": "postgres",
    "password": "postgres",
    "db_name": "ann-2000k",
    "instance_type": "Standard_D8ds_v5",
    "provider": "azure",
    "enable_seqscan": "on"
  },
  "cases": [
    {
      "db-label": "memory-comparison-2000k",
      "drop_old": true,
      "load": true,
      "search-serial": false,
      "search-concurrent": false,
      "case-type": "PerformanceCustomDataset",
      "maintenance-work-mem": "16GB",
      "max-parallel-workers": 7,
      "ef-search": [40],
      "ef-construction": 128,
      "m": 32,
      "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
      "concurrency-duration": 30,
      "k": 10,
      "custom-case-name": "hnsw-1536D-2m",
      "custom-dataset-name": "custom-openai",
      "custom-dataset-dir": "openai_2000k",
      "custom-dataset-size": 2000000,
      "custom-dataset-dim": 1536,
      "custom-dataset-file-count": 4,
      "custom-dataset-use-shuffled": false,
      "create-dataset-args": {
        "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
        "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
        "is-shuffled": false
      },
      "run_count": 1
    }
  ]
}

@@ -0,0 +1,43 @@
{
  "database": {
    "host": "localhost2",
    "username": "postgres",
    "password": "postgres",
    "db_name": "ann-3500k",
    "instance_type": "Standard_D8ds_v5",
    "provider": "azure",
    "enable_seqscan": "on"
  },
  "cases": [
    {
      "db-label": "memory-comparison-3500k",
      "drop_old": true,
      "load": true,
      "search-serial": false,
      "search-concurrent": false,
      "case-type": "PerformanceCustomDataset",
      "maintenance-work-mem": "16GB",
      "max-parallel-workers": 7,
      "ef-search": [40],
      "ef-construction": 128,
      "m": 32,
      "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
      "concurrency-duration": 30,
      "k": 10,
      "custom-case-name": "hnsw-1536D-3_5m",
      "custom-dataset-name": "custom-openai",
      "custom-dataset-dir": "openai_3500k",
      "custom-dataset-size": 3500000,
      "custom-dataset-dim": 1536,
      "custom-dataset-file-count": 7,
      "custom-dataset-use-shuffled": false,
      "create-dataset-args": {
        "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
        "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
        "is-shuffled": false
      },
      "run_count": 1
    }
  ]
}

@@ -0,0 +1,43 @@
{
  "database": {
    "host": "localhost1",
    "username": "postgres",
    "password": "postgres",
    "db_name": "ann-4000k",
    "instance_type": "Standard_D8ds_v5",
    "provider": "azure",
    "enable_seqscan": "on"
  },
  "cases": [
    {
      "db-label": "memory-comparison-4000k",
      "drop_old": true,
      "load": true,
      "search-serial": false,
      "search-concurrent": false,
      "case-type": "PerformanceCustomDataset",
      "maintenance-work-mem": "16GB",
      "max-parallel-workers": 7,
      "ef-search": [40],
      "ef-construction": 128,
      "m": 32,
      "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
      "concurrency-duration": 30,
      "k": 10,
      "custom-case-name": "hnsw-1536D-4m",
      "custom-dataset-name": "custom-openai",
      "custom-dataset-dir": "openai_4000k",
      "custom-dataset-size": 4000000,
      "custom-dataset-dim": 1536,
      "custom-dataset-file-count": 8,
      "custom-dataset-use-shuffled": false,
      "create-dataset-args": {
        "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
        "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
        "is-shuffled": false
      },
      "run_count": 1
    }
  ]
}

@@ -0,0 +1,43 @@
{
  "database": {
    "host": "localhost2",
    "username": "postgres",
    "password": "postgres",
    "db_name": "ann-1500k",
    "instance_type": "Standard_D8ds_v5",
    "provider": "azure",
    "enable_seqscan": "on"
  },
  "cases": [
    {
      "db-label": "memory-comparison-1500k",
      "drop_old": true,
      "load": true,
      "search-serial": false,
      "search-concurrent": false,
      "case-type": "PerformanceCustomDataset",
      "maintenance-work-mem": "16GB",
      "max-parallel-workers": 7,
      "ef-search": [40],
      "ef-construction": 128,
      "m": 32,
      "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
      "concurrency-duration": 30,
      "k": 10,
      "custom-case-name": "hnsw-1536D-1_5m",
      "custom-dataset-name": "custom-openai",
      "custom-dataset-dir": "openai_1500k",
      "custom-dataset-size": 1500000,
      "custom-dataset-dim": 1536,
      "custom-dataset-file-count": 3,
      "custom-dataset-use-shuffled": false,
      "create-dataset-args": {
        "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
        "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
        "is-shuffled": false
      },
      "run_count": 1
    }
  ]
}

@@ -0,0 +1,43 @@
{
  "database": {
    "host": "localhost2",
    "username": "postgres",
    "password": "postgres",
    "db_name": "ann-2500k",
    "instance_type": "Standard_D8ds_v5",
    "provider": "azure",
    "enable_seqscan": "on"
  },
  "cases": [
    {
      "db-label": "memory-comparison-2500k",
      "drop_old": true,
      "load": true,
      "search-serial": false,
      "search-concurrent": false,
      "case-type": "PerformanceCustomDataset",
      "maintenance-work-mem": "16GB",
      "max-parallel-workers": 7,
      "ef-search": [40],
      "ef-construction": 128,
      "m": 32,
      "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
      "concurrency-duration": 30,
      "k": 10,
      "custom-case-name": "hnsw-1536D-2_5m",
      "custom-dataset-name": "custom-openai",
      "custom-dataset-dir": "openai_2500k",
      "custom-dataset-size": 2500000,
      "custom-dataset-dim": 1536,
      "custom-dataset-file-count": 5,
      "custom-dataset-use-shuffled": false,
      "create-dataset-args": {
        "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
        "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
        "is-shuffled": false
      },
      "run_count": 1
    }
  ]
}
