diff --git a/create_dataset_subsets.py b/create_dataset_subsets.py
index efb1d3c82..55c147374 100644
--- a/create_dataset_subsets.py
+++ b/create_dataset_subsets.py
@@ -1,7 +1,11 @@
 import os
 import shutil
 import argparse
+import logging
 
+# Set up logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
 
 def get_file_name(file_name: str, file_prefix: str, files_count: int) -> str:
     if files_count == 1:
@@ -9,28 +13,51 @@ def get_file_name(file_name: str, file_prefix: str, files_count: int) -> str:
     file_name = file_name.split("of-")[0]
     return file_name + "of-" + str(files_count).zfill(2) + ".parquet"
 
-def create_subsets(base_dir, save_dir_path, subset_prefix, file_prefix, step_size):
-    files = sorted([f for f in os.listdir(base_dir) if f.startswith(file_prefix)])
+def create_dataset(base_dir, save_dir_path, subset_prefix, file_prefix, file_count, row_count=500_000):
+    logger.info(f"Starting dataset creation with {file_count} files.")
+
+    # Sort the files and pick only the first 'file_count' files
+    files = sorted([f for f in os.listdir(base_dir) if f.startswith(file_prefix)])[:file_count]
     num_files = len(files)
+
+    if num_files == 0:
+        logger.warning("No files found with the specified prefix.")
+        return
+
+    logger.info(f"Found {num_files} files. Creating dataset...")
+
+    # Create the directory for the dataset
+    subset_dir = os.path.join(save_dir_path, f"{subset_prefix}_{file_count * row_count // 1000}k")
+    os.makedirs(subset_dir, exist_ok=True)
+    logger.info(f"Created directory for the dataset: {subset_dir}")
+
+    # Copy the first 'file_count' files into the subset directory
+    for file in files:
+        src_file = os.path.join(base_dir, file)
+        dst_file = os.path.join(subset_dir, get_file_name(file, file_prefix, file_count))
+        shutil.copy(src_file, dst_file)
+        logger.info(f"Copied {file} to {dst_file}")
 
-    for i in range(1, num_files + 1):
-        subset_dir = os.path.join(save_dir_path, f"{subset_prefix}_{i * step_size // 1000}k")
-        os.makedirs(subset_dir, exist_ok=True)
-
-        for j in range(i):
-            src_file = os.path.join(base_dir, files[j])
-            dst_file = os.path.join(subset_dir, get_file_name(files[j], file_prefix, i))
-            shutil.copy(src_file, dst_file)
-        src_test_file = os.path.join(base_dir, "test.parquet")
-        dst_test_file = os.path.join(subset_dir, "test.parquet")
-        shutil.copy(src_test_file, dst_test_file)
+    # Also copy the test.parquet file
+    src_test_file = os.path.join(base_dir, "test.parquet")
+    dst_test_file = os.path.join(subset_dir, "test.parquet")
+    shutil.copy(src_test_file, dst_test_file)
+    logger.info(f"Copied test.parquet to {subset_dir}")
+
+    src_neighbors_file = os.path.join(base_dir, "neighbors.parquet")
+    dst_neighbors_file = os.path.join(subset_dir, "neighbors.parquet")
+    shutil.copy(src_neighbors_file, dst_neighbors_file)
+    logger.info(f"Copied neighbors.parquet to {subset_dir}")
+
+    logger.info(f"Dataset creation completed. {file_count} files have been copied to {subset_dir}.")
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Create subsets of Parquet files using Dask.")
-    parser.add_argument("--directory", type=str, help="Path to the directory containing Parquet files.")
-    parser.add_argument("--save-dir-path", type=str, help="Directory path where data will be saved")
-    parser.add_argument("--dataset-name-prefix", type=str, help="Name prefix of the folder where each subset will be saved.")
+    parser = argparse.ArgumentParser(description="Create a dataset with a specified number of Parquet files.")
+    parser.add_argument("--directory", type=str, required=True, help="Path to the directory containing Parquet files.")
+    parser.add_argument("--save-dir-path", type=str, required=True, help="Directory path where the dataset will be saved.")
+    parser.add_argument("--dataset-name-prefix", type=str, help="Name prefix for the dataset folder.")
     parser.add_argument("--is-shuffled", type=bool, help="Whether the files are shuffled or not.")
+    parser.add_argument("--file-count", type=int, required=True, help="Number of Parquet files to include in the dataset.")
     args = parser.parse_args()
 
     file_prefix = (
@@ -48,7 +75,14 @@ def create_subsets(base_dir, save_dir_path, subset_prefix, file_prefix, step_siz
         if args.save_dir_path
         else args.directory
     )
 
-    step_size = 500_000 # 500k
-    create_subsets(args.directory, save_dir_path, subset_prefix, file_prefix, step_size)
-    print(f'Finished creating subsets of Parquet files in {args.directory}.')
\ No newline at end of file
+    if os.path.exists(save_dir_path) and os.listdir(save_dir_path):
+        shutil.rmtree(save_dir_path)
+        logger.info(f"Deleted existing directory: {save_dir_path}")
+
+    # Log the input parameters
+    logger.info(f"Parameters received: directory={args.directory}, save_dir_path={args.save_dir_path}, file_count={args.file_count}, dataset_name_prefix={subset_prefix}, is_shuffled={args.is_shuffled}")
+
+    # Create the dataset with the specified file_count
+    create_dataset(args.directory, save_dir_path, subset_prefix, file_prefix, args.file_count)
+    logger.info(f'Finished creating a dataset with {args.file_count} Parquet files.')
\ No newline at end of file
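
To make the renaming rule above concrete: `get_file_name` keeps the file name up to the `of-` marker and rewrites the trailing count, so a subset of a larger dataset is renumbered as if it were a complete dataset of `file_count` files. A minimal sketch, assuming source files follow the `train-XX-of-YY.parquet` naming convention (the file names below are hypothetical examples):

```python
# Standalone sketch of the files_count > 1 branch of get_file_name above
# (the files_count == 1 branch is elided in the diff and not shown here).
def rename_subset_file(file_name: str, files_count: int) -> str:
    stem = file_name.split("of-")[0]  # "train-00-of-10.parquet" -> "train-00-"
    return stem + "of-" + str(files_count).zfill(2) + ".parquet"

# A 4-file (2M-row) subset cut from a 10-file source dataset is renumbered:
assert rename_subset_file("train-00-of-10.parquet", 4) == "train-00-of-04.parquet"
assert rename_subset_file("train-03-of-10.parquet", 4) == "train-03-of-04.parquet"
```
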
diff --git a/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-1000k.json b/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-1000k.json
new file mode 100644
index 000000000..c3674f572
--- /dev/null
+++ b/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-1000k.json
@@ -0,0 +1,43 @@
+{
+    "database": {
+        "host": "localhost1",
+        "username": "postgres",
+        "password": "postgres",
+        "db_name": "ann-1000k",
+        "instance_type": "Standard_D8ds_v5",
+        "provider": "azure",
+        "enable_seqscan": "on"
+    },
+    "cases": [
+        {
+            "db-label": "memory-comparison-1000k",
+            "drop_old": true,
+            "load": true,
+            "search-serial": false,
+            "search-concurrent": false,
+            "case-type": "PerformanceCustomDataset",
+            "maintenance-work-mem": "16GB",
+            "max-parallel-workers": 7,
+            "ef-search": [40],
+            "ef-construction": 128,
+            "m": 32,
+            "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
+            "concurrency-duration": 30,
+            "k": 10,
+            "custom-case-name": "hnsw-1536D-1m",
+            "custom-dataset-name": "custom-openai",
+            "custom-dataset-dir": "openai_1000k",
+            "custom-dataset-size": 1000000,
+            "custom-dataset-dim": 1536,
+            "custom-dataset-file-count": 2,
+            "custom-dataset-use-shuffled": false,
+            "create-dataset-args": {
+                "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
+                "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
+                "is-shuffled": false
+            },
+            "run_count": 1
+        }
+    ]
+}
+
\ No newline at end of file
diff --git a/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-2000k.json b/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-2000k.json
new file mode 100644
index 000000000..0d95f1197
--- /dev/null
+++ b/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-2000k.json
@@ -0,0 +1,43 @@
+{
+    "database": {
+        "host": "localhost1",
+        "username": "postgres",
+        "password": "postgres",
+        "db_name": "ann-2000k",
+        "instance_type": "Standard_D8ds_v5",
+        "provider": "azure",
+        "enable_seqscan": "on"
+    },
+    "cases": [
+        {
+            "db-label": "memory-comparison-2000k",
+            "drop_old": true,
+            "load": true,
+            "search-serial": false,
+            "search-concurrent": false,
+            "case-type": "PerformanceCustomDataset",
+            "maintenance-work-mem": "16GB",
+            "max-parallel-workers": 7,
+            "ef-search": [40],
+            "ef-construction": 128,
+            "m": 32,
+            "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
+            "concurrency-duration": 30,
+            "k": 10,
+            "custom-case-name": "hnsw-1536D-2m",
+            "custom-dataset-name": "custom-openai",
+            "custom-dataset-dir": "openai_2000k",
+            "custom-dataset-size": 2000000,
+            "custom-dataset-dim": 1536,
+            "custom-dataset-file-count": 4,
+            "custom-dataset-use-shuffled": false,
+            "create-dataset-args": {
+                "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
+                "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
+                "is-shuffled": false
+            },
+            "run_count": 1
+        }
+    ]
+}
+
\ No newline at end of file
diff --git a/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-3500k.json b/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-3500k.json
new file mode 100644
index 000000000..1cd60da9e
--- /dev/null
+++ b/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-3500k.json
@@ -0,0 +1,43 @@
+{
+    "database": {
+        "host": "localhost2",
+        "username": "postgres",
+        "password": "postgres",
+        "db_name": "ann-3500k",
+        "instance_type": "Standard_D8ds_v5",
+        "provider": "azure",
+        "enable_seqscan": "on"
+    },
+    "cases": [
+        {
+            "db-label": "memory-comparison-3500k",
+            "drop_old": true,
+            "load": true,
+            "search-serial": false,
+            "search-concurrent": false,
+            "case-type": "PerformanceCustomDataset",
+            "maintenance-work-mem": "16GB",
+            "max-parallel-workers": 7,
+            "ef-search": [40],
+            "ef-construction": 128,
+            "m": 32,
+            "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
+            "concurrency-duration": 30,
+            "k": 10,
+            "custom-case-name": "hnsw-1536D-3_5m",
+            "custom-dataset-name": "custom-openai",
+            "custom-dataset-dir": "openai_3500k",
+            "custom-dataset-size": 3500000,
+            "custom-dataset-dim": 1536,
+            "custom-dataset-file-count": 7,
+            "custom-dataset-use-shuffled": false,
+            "create-dataset-args": {
+                "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
+                "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
+                "is-shuffled": false
+            },
+            "run_count": 1
+        }
+    ]
+}
+
\ No newline at end of file
diff --git a/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-4000k.json b/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-4000k.json
new file mode 100644
index 000000000..017c2bdcf
--- /dev/null
+++ b/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-4000k.json
@@ -0,0 +1,43 @@
+{
+    "database": {
+        "host": "localhost1",
+        "username": "postgres",
+        "password": "postgres",
+        "db_name": "ann-4000k",
+        "instance_type": "Standard_D8ds_v5",
+        "provider": "azure",
+        "enable_seqscan": "on"
+    },
+    "cases": [
+        {
+            "db-label": "memory-comparison-4000k",
+            "drop_old": true,
+            "load": true,
+            "search-serial": false,
+            "search-concurrent": false,
+            "case-type": "PerformanceCustomDataset",
+            "maintenance-work-mem": "16GB",
+            "max-parallel-workers": 7,
+            "ef-search": [40],
+            "ef-construction": 128,
+            "m": 32,
+            "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
+            "concurrency-duration": 30,
+            "k": 10,
+            "custom-case-name": "hnsw-1536D-4m",
+            "custom-dataset-name": "custom-openai",
+            "custom-dataset-dir": "openai_4000k",
+            "custom-dataset-size": 4000000,
+            "custom-dataset-dim": 1536,
+            "custom-dataset-file-count": 8,
+            "custom-dataset-use-shuffled": false,
+            "create-dataset-args": {
+                "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
+                "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
+                "is-shuffled": false
+            },
+            "run_count": 1
+        }
+    ]
+}
+
\ No newline at end of file
diff --git a/custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-1500k.json b/custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-1500k.json
new file mode 100644
index 000000000..d726ea1fd
--- /dev/null
+++ b/custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-1500k.json
@@ -0,0 +1,43 @@
+{
+    "database": {
+        "host": "localhost2",
+        "username": "postgres",
+        "password": "postgres",
+        "db_name": "ann-1500k",
+        "instance_type": "Standard_D8ds_v5",
+        "provider": "azure",
+        "enable_seqscan": "on"
+    },
+    "cases": [
+        {
+            "db-label": "memory-comparison-1500k",
+            "drop_old": true,
+            "load": true,
+            "search-serial": false,
+            "search-concurrent": false,
+            "case-type": "PerformanceCustomDataset",
+            "maintenance-work-mem": "16GB",
+            "max-parallel-workers": 7,
+            "ef-search": [40],
+            "ef-construction": 128,
+            "m": 32,
+            "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
+            "concurrency-duration": 30,
+            "k": 10,
+            "custom-case-name": "hnsw-1536D-1_5m",
+            "custom-dataset-name": "custom-openai",
+            "custom-dataset-dir": "openai_1500k",
+            "custom-dataset-size": 1500000,
+            "custom-dataset-dim": 1536,
+            "custom-dataset-file-count": 3,
+            "custom-dataset-use-shuffled": false,
+            "create-dataset-args": {
+                "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
+                "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
+                "is-shuffled": false
+            },
+            "run_count": 1
+        }
+    ]
+}
+
\ No newline at end of file
diff --git a/custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-2500k.json b/custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-2500k.json
new file mode 100644
index 000000000..801005c0c
--- /dev/null
+++ b/custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-2500k.json
@@ -0,0 +1,43 @@
+{
+    "database": {
+        "host": "localhost2",
+        "username": "postgres",
+        "password": "postgres",
+        "db_name": "ann-2500k",
+        "instance_type": "Standard_D8ds_v5",
+        "provider": "azure",
+        "enable_seqscan": "on"
+    },
+    "cases": [
+        {
+            "db-label": "memory-comparison-2500k",
+            "drop_old": true,
+            "load": true,
+            "search-serial": false,
+            "search-concurrent": false,
+            "case-type": "PerformanceCustomDataset",
+            "maintenance-work-mem": "16GB",
+            "max-parallel-workers": 7,
+            "ef-search": [40],
+            "ef-construction": 128,
+            "m": 32,
+            "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
+            "concurrency-duration": 30,
+            "k": 10,
+            "custom-case-name": "hnsw-1536D-2_5m",
+            "custom-dataset-name": "custom-openai",
+            "custom-dataset-dir": "openai_2500k",
+            "custom-dataset-size": 2500000,
+            "custom-dataset-dim": 1536,
+            "custom-dataset-file-count": 5,
+            "custom-dataset-use-shuffled": false,
+            "create-dataset-args": {
+                "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
+                "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
+                "is-shuffled": false
+            },
+            "run_count": 1
+        }
+    ]
+}
+
\ No newline at end of file
diff --git a/custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-4500k.json b/custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-4500k.json
new file mode 100644
index 000000000..e31cf3bfa
--- /dev/null
+++ b/custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-4500k.json
@@ -0,0 +1,43 @@
+{
+    "database": {
+        "host": "localhost2",
+        "username": "postgres",
+        "password": "postgres",
+        "db_name": "ann-4500k",
+        "instance_type": "Standard_D8ds_v5",
+        "provider": "azure",
+        "enable_seqscan": "on"
+    },
+    "cases": [
+        {
+            "db-label": "memory-comparison-4500k",
+            "drop_old": true,
+            "load": true,
+            "search-serial": false,
+            "search-concurrent": false,
+            "case-type": "PerformanceCustomDataset",
+            "maintenance-work-mem": "16GB",
+            "max-parallel-workers": 7,
+            "ef-search": [40],
+            "ef-construction": 128,
+            "m": 32,
+            "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
+            "concurrency-duration": 30,
+            "k": 10,
+            "custom-case-name": "hnsw-1536D-4_5m",
+            "custom-dataset-name": "custom-openai",
+            "custom-dataset-dir": "openai_4500k",
+            "custom-dataset-size": 4500000,
+            "custom-dataset-dim": 1536,
+            "custom-dataset-file-count": 9,
+            "custom-dataset-use-shuffled": false,
+            "create-dataset-args": {
+                "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
+                "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
+                "is-shuffled": false
+            },
+            "run_count": 1
+        }
+    ]
+}
+
\ No newline at end of file
diff --git a/custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-3000k.json b/custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-3000k.json
new file mode 100644
index 000000000..496d868db
--- /dev/null
+++ b/custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-3000k.json
@@ -0,0 +1,43 @@
+{
+    "database": {
+        "host": "localhost1",
+        "username": "postgres",
+        "password": "postgres",
+        "db_name": "ann-3000k",
+        "instance_type": "Standard_D8ds_v5",
+        "provider": "azure",
+        "enable_seqscan": "on"
+    },
+    "cases": [
+        {
+            "db-label": "memory-comparison-3000k",
+            "drop_old": true,
+            "load": true,
+            "search-serial": false,
+            "search-concurrent": false,
+            "case-type": "PerformanceCustomDataset",
+            "maintenance-work-mem": "16GB",
+            "max-parallel-workers": 7,
+            "ef-search": [40],
+            "ef-construction": 128,
+            "m": 32,
+            "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
+            "concurrency-duration": 30,
+            "k": 10,
+            "custom-case-name": "hnsw-1536D-3m",
+            "custom-dataset-name": "custom-openai",
+            "custom-dataset-dir": "openai_3000k",
+            "custom-dataset-size": 3000000,
+            "custom-dataset-dim": 1536,
+            "custom-dataset-file-count": 6,
+            "custom-dataset-use-shuffled": false,
+            "create-dataset-args": {
+                "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
+                "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
+                "is-shuffled": false
+            },
+            "run_count": 1
+        }
+    ]
+}
+
\ No newline at end of file
diff --git a/custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-5000k.json b/custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-5000k.json
new file mode 100644
index 000000000..0656dd083
--- /dev/null
+++ b/custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-5000k.json
@@ -0,0 +1,43 @@
+{
+    "database": {
+        "host": "localhost1",
+        "username": "postgres",
+        "password": "postgres",
+        "db_name": "ann-5000k",
+        "instance_type": "Standard_D8ds_v5",
+        "provider": "azure",
+        "enable_seqscan": "on"
+    },
+    "cases": [
+        {
+            "db-label": "memory-comparison-5000k",
+            "drop_old": true,
+            "load": true,
+            "search-serial": false,
+            "search-concurrent": false,
+            "case-type": "PerformanceCustomDataset",
+            "maintenance-work-mem": "16GB",
+            "max-parallel-workers": 7,
+            "ef-search": [40],
+            "ef-construction": 128,
+            "m": 32,
+            "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
+            "concurrency-duration": 30,
+            "k": 10,
+            "custom-case-name": "hnsw-1536D-5m",
+            "custom-dataset-name": "custom-openai",
+            "custom-dataset-dir": "openai_5000k",
+            "custom-dataset-size": 5000000,
+            "custom-dataset-dim": 1536,
+            "custom-dataset-file-count": 10,
+            "custom-dataset-use-shuffled": false,
+            "create-dataset-args": {
+                "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
+                "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
+                "is-shuffled": false
+            },
+            "run_count": 1
+        }
+    ]
+}
+
\ No newline at end of file
diff --git a/custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-500k.json b/custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-500k.json
new file mode 100644
index 000000000..61c3cd037
--- /dev/null
+++ b/custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-500k.json
@@ -0,0 +1,43 @@
+{
+    "database": {
+        "host": "localhost1",
+        "username": "postgres",
+        "password": "postgres",
+        "db_name": "ann-500k",
+        "instance_type": "Standard_D8ds_v5",
+        "provider": "azure",
+        "enable_seqscan": "on"
+    },
+    "cases": [
+        {
+            "db-label": "memory-comparison-500k",
+            "drop_old": true,
+            "load": true,
+            "search-serial": false,
+            "search-concurrent": false,
+            "case-type": "PerformanceCustomDataset",
+            "maintenance-work-mem": "16GB",
+            "max-parallel-workers": 7,
+            "ef-search": [40],
+            "ef-construction": 128,
+            "m": 32,
+            "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
+            "concurrency-duration": 30,
+            "k": 10,
+            "custom-case-name": "hnsw-1536D-500K",
+            "custom-dataset-name": "custom-openai",
+            "custom-dataset-dir": "openai_500k",
+            "custom-dataset-size": 500000,
+            "custom-dataset-dim": 1536,
+            "custom-dataset-file-count": 1,
+            "custom-dataset-use-shuffled": false,
+            "create-dataset-args": {
+                "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
+                "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
+                "is-shuffled": false
+            },
+            "run_count": 1
+        }
+    ]
+}
+
\ No newline at end of file
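
The twenty new configs come in pairs per dataset size: the custom-run-build-index-configs-* files above load the data and build the HNSW index (`drop_old`/`load` true, search flags false, `run_count` 1), while the matching custom-run-configs-* files below reuse that index for search only (`drop_old`/`load` false, search flags true, `run_count` 3). Every size-dependent field is derived from the row count: each source parquet file holds 500k rows, so `custom-dataset-file-count` is `size / 500_000` and the directory and database names carry the size in thousands. Since the files are otherwise identical, they could be generated rather than hand-maintained; a minimal sketch under those assumptions (the `make_config` helper is hypothetical, and the fields that never vary are abbreviated):

```python
import json

ROWS_PER_FILE = 500_000  # one source parquet file per 500k rows

def make_config(size: int, host: str, build: bool) -> dict:
    k = size // 1000  # e.g. 2_500_000 -> 2500, matching the "2500k" names above
    return {
        "database": {
            "host": host, "username": "postgres", "password": "postgres",
            "db_name": f"ann-{k}k", "instance_type": "Standard_D8ds_v5",
            "provider": "azure", "enable_seqscan": "on",
        },
        "cases": [{
            "db-label": f"memory-comparison-{k}k",
            "drop_old": build, "load": build,            # build phase writes data
            "search-serial": not build,                  # run phase only searches
            "search-concurrent": not build,
            "custom-dataset-dir": f"openai_{k}k",
            "custom-dataset-size": size,
            "custom-dataset-file-count": size // ROWS_PER_FILE,
            "run_count": 1 if build else 3,
            # ...remaining fields are identical across all sizes...
        }],
    }

print(json.dumps(make_config(2_500_000, "localhost2", build=True), indent=4))
```
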
"PerformanceCustomDataset", + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-1m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_1000k", + "custom-dataset-size": 1000000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 2, + "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", + "is-shuffled": false + }, + "run_count": 3 + } + ] + } + \ No newline at end of file diff --git a/custom-run-configs-1/config-custom-dataset-small-hnsw-2000k.json b/custom-run-configs-1/config-custom-dataset-small-hnsw-2000k.json new file mode 100644 index 000000000..fc56c9280 --- /dev/null +++ b/custom-run-configs-1/config-custom-dataset-small-hnsw-2000k.json @@ -0,0 +1,43 @@ +{ + "database": { + "host": "localhost1", + "username": "postgres", + "password": "postgres", + "db_name": "ann-2000k", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison-2000k", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-2m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_2000k", + "custom-dataset-size": 2000000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 4, + "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", + "is-shuffled": false + }, + "run_count": 3 + } + ] + } + \ No newline at end of file diff --git a/custom-run-configs-1/config-custom-dataset-small-hnsw-3500k.json b/custom-run-configs-1/config-custom-dataset-small-hnsw-3500k.json new file mode 100644 index 000000000..3f1145cf5 --- /dev/null +++ b/custom-run-configs-1/config-custom-dataset-small-hnsw-3500k.json @@ -0,0 +1,43 @@ +{ + "database": { + "host": "localhost2", + "username": "postgres", + "password": "postgres", + "db_name": "ann-3500k", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison-3500k", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-3_5m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_3500k", + "custom-dataset-size": 3500000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 7, + "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": 
"/home/ubuntu/vectordb_bench/dataset/custom-openai/", + "is-shuffled": false + }, + "run_count": 3 + } + ] + } + \ No newline at end of file diff --git a/custom-run-configs-1/config-custom-dataset-small-hnsw-4000k.json b/custom-run-configs-1/config-custom-dataset-small-hnsw-4000k.json new file mode 100644 index 000000000..a74a21264 --- /dev/null +++ b/custom-run-configs-1/config-custom-dataset-small-hnsw-4000k.json @@ -0,0 +1,43 @@ +{ + "database": { + "host": "localhost1", + "username": "postgres", + "password": "postgres", + "db_name": "ann-4000k", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison-4000k", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-4m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_4000k", + "custom-dataset-size": 4000000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 8, + "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", + "is-shuffled": false + }, + "run_count": 3 + } + ] + } + \ No newline at end of file diff --git a/custom-run-configs-2/config-custom-dataset-small-hnsw-1500k.json b/custom-run-configs-2/config-custom-dataset-small-hnsw-1500k.json new file mode 100644 index 000000000..7e3c29493 --- /dev/null +++ b/custom-run-configs-2/config-custom-dataset-small-hnsw-1500k.json @@ -0,0 +1,43 @@ +{ + "database": { + "host": "localhost2", + "username": "postgres", + "password": "postgres", + "db_name": "ann-1500k", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison-1500k", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-1_5m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_1500k", + "custom-dataset-size": 1500000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 3, + "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", + "is-shuffled": false + }, + "run_count": 3 + } + ] + } + \ No newline at end of file diff --git a/custom-run-configs-2/config-custom-dataset-small-hnsw-2500k.json b/custom-run-configs-2/config-custom-dataset-small-hnsw-2500k.json new file mode 100644 index 000000000..4aed76165 --- /dev/null +++ b/custom-run-configs-2/config-custom-dataset-small-hnsw-2500k.json @@ -0,0 +1,43 @@ +{ + "database": { + "host": "localhost2", + "username": "postgres", + "password": "postgres", + "db_name": "ann-2500k", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + 
"db-label": "memory-comparison-2500k", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-2_5m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_2500k", + "custom-dataset-size": 2500000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 5, + "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", + "is-shuffled": false + }, + "run_count": 3 + } + ] + } + \ No newline at end of file diff --git a/custom-run-configs-2/config-custom-dataset-small-hnsw-4500k.json b/custom-run-configs-2/config-custom-dataset-small-hnsw-4500k.json new file mode 100644 index 000000000..be671b538 --- /dev/null +++ b/custom-run-configs-2/config-custom-dataset-small-hnsw-4500k.json @@ -0,0 +1,43 @@ +{ + "database": { + "host": "localhost2", + "username": "postgres", + "password": "postgres", + "db_name": "ann-4500k", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison-4500k", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-4_5m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_4500k", + "custom-dataset-size": 4500000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 9, + "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", + "is-shuffled": false + }, + "run_count": 3 + } + ] + } + \ No newline at end of file diff --git a/custom-run-configs-3/config-custom-dataset-small-hnsw-3000k.json b/custom-run-configs-3/config-custom-dataset-small-hnsw-3000k.json new file mode 100644 index 000000000..1cdfb7beb --- /dev/null +++ b/custom-run-configs-3/config-custom-dataset-small-hnsw-3000k.json @@ -0,0 +1,43 @@ +{ + "database": { + "host": "localhost1", + "username": "postgres", + "password": "postgres", + "db_name": "ann-3000k", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison-3000k", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-3m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_3000k", + "custom-dataset-size": 3000000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 6, + "custom-dataset-use-shuffled": false, + 
"create-dataset-args": { + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", + "is-shuffled": false + }, + "run_count": 3 + } + ] + } + \ No newline at end of file diff --git a/custom-run-configs-3/config-custom-dataset-small-hnsw-5000k.json b/custom-run-configs-3/config-custom-dataset-small-hnsw-5000k.json new file mode 100644 index 000000000..b1244c0cb --- /dev/null +++ b/custom-run-configs-3/config-custom-dataset-small-hnsw-5000k.json @@ -0,0 +1,43 @@ +{ + "database": { + "host": "localhost1", + "username": "postgres", + "password": "postgres", + "db_name": "ann-5000k", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison-5000k", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-5m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_5000k", + "custom-dataset-size": 5000000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 10, + "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", + "is-shuffled": false + }, + "run_count": 3 + } + ] + } + \ No newline at end of file diff --git a/custom-run-configs-3/config-custom-dataset-small-hnsw-500k.json b/custom-run-configs-3/config-custom-dataset-small-hnsw-500k.json new file mode 100644 index 000000000..f3ef4ba40 --- /dev/null +++ b/custom-run-configs-3/config-custom-dataset-small-hnsw-500k.json @@ -0,0 +1,43 @@ +{ + "database": { + "host": "localhost1", + "username": "postgres", + "password": "postgres", + "db_name": "ann-500k", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison-500k", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-500K", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_500k", + "custom-dataset-size": 500000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 1, + "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", + "is-shuffled": false + }, + "run_count": 3 + } + ] + } + \ No newline at end of file diff --git a/run-custom-dataset.py b/run-custom-dataset.py index a5d1213d5..01979ff46 100644 --- a/run-custom-dataset.py +++ b/run-custom-dataset.py @@ -1,3 +1,4 @@ +import argparse import json import time from contextlib import redirect_stdout @@ -6,6 +7,7 @@ import psycopg from psycopg import sql import os +import shutil os.environ["LOG_LEVEL"] = "DEBUG" @@ -40,11 +42,62 @@ def setup_database(config): 
diff --git a/run-custom-dataset.py b/run-custom-dataset.py
index a5d1213d5..01979ff46 100644
--- a/run-custom-dataset.py
+++ b/run-custom-dataset.py
@@ -1,3 +1,4 @@
+import argparse
 import json
 import time
 from contextlib import redirect_stdout
@@ -6,6 +7,7 @@
 import psycopg
 from psycopg import sql
 import os
+import shutil
 
 os.environ["LOG_LEVEL"] = "DEBUG"
 
@@ -40,11 +42,62 @@ def setup_database(config):
         cursor = conn.cursor()
         cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")
         cursor.execute("CREATE EXTENSION IF NOT EXISTS pg_buffercache;")
+        cursor.execute("CREATE EXTENSION IF NOT EXISTS pg_prewarm;")
         conn.commit()
         conn.close()
     except Exception as e:
        print(f"Setup failed: {e}")
 
+
+def create_dataset(args: dict) -> bool:
+    """
+    Create a dataset from the original dataset by invoking the
+    create_dataset_subsets.py script with the required arguments.
+    """
+    file_count = args.get("file-count")
+    is_shuffled = args.get("is-shuffled")
+    directory = args.get("directory")
+    output_dir = args.get("save-dir-path")
+
+    try:
+        # Define the command to run the create_dataset_subsets.py script
+        command = [
+            "python3", "create_dataset_subsets.py",
+            "--directory", directory,
+            "--save-dir-path", output_dir,
+            "--file-count", str(file_count),
+        ]
+        print(f"Running command: {' '.join(command)}")
+
+        file_prefix = "train"
+        if is_shuffled:
+            file_prefix = "shuffle_train"
+            command += ["--is-shuffled", "True"]
+        subprocess.run(command, check=True)
+        print("Checking if the dataset was created successfully.")
+
+        created_files_count = sum([1 for _, _, files in os.walk(output_dir) for f in files if f.startswith(file_prefix)])
+        print(f"Number of files in the output dataset directory: {created_files_count}")
+
+        if created_files_count != file_count:
+            raise Exception("Incorrect number of files.")
+        print("Dataset creation successful.")
+    except Exception as e:
+        print(f"Dataset creation failed: {e}")
+        return False
+
+    return True
+
+def delete_dataset(dataset_dir: str):
+    try:
+        if os.path.exists(dataset_dir):
+            shutil.rmtree(dataset_dir)
+            print(f"Deleted directory: {dataset_dir}")
+        else:
+            print(f"Directory does not exist: {dataset_dir}")
+    except Exception as e:
+        print(f"Failed to delete directory: {e}")
+
 def teardown_database(config):
     # Optionally drop the database after the test
     pass
@@ -132,6 +185,24 @@
         print(f"Failed to query configurations: {e}")
         return {}
 
+def pre_warm(config):
+    print(f"Running pre-warm for database: {config['db_name']}")
+    try:
+        conn = psycopg.connect(
+            dbname=config['db_name'],
+            user=config['username'],
+            password=config['password'],
+            host=config['host'],
+        )
+        cursor = conn.cursor()
+        cursor.execute("SELECT pg_prewarm('public.pgvector_index') as block_loaded")
+        conn.commit()
+
+        result = cursor.fetchone()
+        print(f"Pre-warm blocks loaded: {result[0]}")
+        conn.close()
+    except Exception as e:
+        print(f"Failed to pre-warm the database: {e}")
 
 def run_benchmark(case, db_config):
     base_command = [
@@ -225,6 +296,7 @@
             print(f"{key}: {value}")
         get_stats(db_config)
         f.flush()
+        pre_warm(db_config)
 
         print(f"Running command: {' '.join(command)}")
         f.flush()
@@ -246,16 +318,31 @@
         time.sleep(60)
 
 def main():
-    config = load_config("config.json")
-    start_time = time.time()
-    for case in config['cases']:
-        print(f"Running case: {case['db-label']}")
-        setup_database(config)
-
-        run_benchmark(case, config['database'])
-    end_time = time.time()
-    execution_time = end_time - start_time
-    print(f"COMPLETED ALL EXECUTIONS. total_duration={execution_time}")
+    parser = argparse.ArgumentParser(description="Run benchmarks on a custom dataset.")
+    parser.add_argument("--config-dir-path", type=str, help="Path to the config files directory.")
+    args = parser.parse_args()
+
+    for dir_path, _, file_names in os.walk(args.config_dir_path):
+        for file_name in file_names:
+            config = load_config(os.path.join(dir_path, file_name))
+            start_time = time.time()
+            for case in config['cases']:
+                print(f"Running case: {case['db-label']}")
+                setup_database(config)
+
+                create_dataset_args = case['create-dataset-args']
+                create_dataset_args["file-count"] = case["custom-dataset-file-count"]
+                dataset_created = create_dataset(create_dataset_args)
+                if not dataset_created:
+                    print(f"Failed to create dataset for case: {case['custom-case-name']} -- Skipping execution.")
+                    continue
+
+                run_benchmark(case, config['database'])
+                teardown_database(config)
+                delete_dataset(create_dataset_args["save-dir-path"])
+            end_time = time.time()
+            execution_time = end_time - start_time
+            print(f"COMPLETED ALL EXECUTIONS of config {file_name}. total_duration={execution_time}")
 
 if __name__ == "__main__":
     main()
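
A note on the pre_warm step added above: `pg_prewarm('public.pgvector_index')` reads the index into shared buffers and returns the number of blocks loaded, so the search phase does not pay cold-cache I/O on its first queries. Because `setup_database` also installs `pg_buffercache`, the warm-up can be verified by counting the index's cached blocks; a minimal sketch, assuming the index is named `pgvector_index` as `pre_warm` does (the connection values below are hypothetical and mirror the config files above):

```python
import psycopg

# Count shared-buffer blocks belonging to the (assumed) pgvector_index relation.
QUERY = """
SELECT count(*) AS cached_blocks
FROM pg_buffercache b
JOIN pg_class c ON b.relfilenode = pg_relation_filenode(c.oid)
WHERE c.relname = 'pgvector_index';
"""

with psycopg.connect(dbname="ann-1000k", user="postgres",
                     password="postgres", host="localhost1") as conn:
    cached_blocks = conn.execute(QUERY).fetchone()[0]
    print(f"pgvector_index blocks in shared_buffers: {cached_blocks}")
```
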
total_duration={execution_time}") + parser = argparse.ArgumentParser(description="Run benchmarks on a custom dataset.") + parser.add_argument("--config-dir-path", type=str, help="Path to the config files directory.") + args = parser.parse_args() + + for dir_path, _, file_names in os.walk(args.config_dir_path): + for file_name in file_names: + config = load_config(os.path.join(dir_path, file_name)) + start_time = time.time() + for case in config['cases']: + print(f"Running case: {case['db-label']}") + setup_database(config) + + create_dataset_args = case['create-dataset-args'] + create_dataset_args["file-count"] = case["custom-dataset-file-count"] + dataset_created = create_dataset(create_dataset_args) + if not dataset_created: + print(f"Failed to create dataset for case: {case['custom-case-name']} -- Skipping execution.") + continue + + run_benchmark(case, config['database']) + teardown_database(config) + delete_dataset(create_dataset_args["save-dir-path"]) + end_time = time.time() + execution_time = end_time - start_time + print(f"COMPLETED ALL EXECUTIONS of config {file_name}. total_duration={execution_time}") if __name__ == "__main__": main() diff --git a/sample-configs/config-custom-dataset-small-hnsw.json b/sample-configs/config-custom-dataset-small-hnsw.json index 8eb2b865b..707ec41fa 100644 --- a/sample-configs/config-custom-dataset-small-hnsw.json +++ b/sample-configs/config-custom-dataset-small-hnsw.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/vectordb_bench/__init__.py b/vectordb_bench/__init__.py index 3d8419a4f..3795535ed 100644 --- a/vectordb_bench/__init__.py +++ b/vectordb_bench/__init__.py @@ -21,7 +21,7 @@ class config: NUM_PER_BATCH = env.int("NUM_PER_BATCH", 5000) DROP_OLD = env.bool("DROP_OLD", True) - USE_SHUFFLED_DATA = env.bool("USE_SHUFFLED_DATA", True) + USE_SHUFFLED_DATA = env.bool("USE_SHUFFLED_DATA", False) NUM_CONCURRENCY = env.list("NUM_CONCURRENCY", [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100], subcast=int )