From 3905f46d2f200886d8a4adf9fc352457ef9ef0a2 Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Mon, 14 Oct 2024 18:58:24 +0500 Subject: [PATCH 01/20] Add separate configs for index and search for multiple datasets. --- ...onfig-custom-dataset-small-hnsw-1000k.json | 38 +++++++++++++++++++ ...onfig-custom-dataset-small-hnsw-1500k.json | 38 +++++++++++++++++++ ...onfig-custom-dataset-small-hnsw-2000k.json | 38 +++++++++++++++++++ ...onfig-custom-dataset-small-hnsw-2500k.json | 38 +++++++++++++++++++ ...config-custom-dataset-small-hnsw-500k.json | 38 +++++++++++++++++++ ...onfig-custom-dataset-small-hnsw-1000k.json | 38 +++++++++++++++++++ ...onfig-custom-dataset-small-hnsw-1500k.json | 38 +++++++++++++++++++ ...onfig-custom-dataset-small-hnsw-2000k.json | 38 +++++++++++++++++++ ...onfig-custom-dataset-small-hnsw-2500k.json | 38 +++++++++++++++++++ ...config-custom-dataset-small-hnsw-500k.json | 38 +++++++++++++++++++ 10 files changed, 380 insertions(+) create mode 100644 custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json create mode 100644 custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json create mode 100644 custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json create mode 100644 custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json create mode 100644 custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json create mode 100644 custom-run-configs/config-custom-dataset-small-hnsw-1000k.json create mode 100644 custom-run-configs/config-custom-dataset-small-hnsw-1500k.json create mode 100644 custom-run-configs/config-custom-dataset-small-hnsw-2000k.json create mode 100644 custom-run-configs/config-custom-dataset-small-hnsw-2500k.json create mode 100644 custom-run-configs/config-custom-dataset-small-hnsw-500k.json diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json 
b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json new file mode 100644 index 000000000..6a468866b --- /dev/null +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json @@ -0,0 +1,38 @@ +{ + "database": { + "host": "localhost1", + "username": "postgres", + "password": "postgres", + "db_name": "ann2", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison-run-seqon", + "drop_old": true, + "load": true, + "search-serial": false, + "search-concurrent": false, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "20GB", + "max-parallel-workers": 7, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-1m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_1000k", + "custom-dataset-size": 1000000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 2, + "custom-dataset-use-shuffled": false, + "run_count": 1 + } + ] + } + \ No newline at end of file diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json new file mode 100644 index 000000000..4194ad0e4 --- /dev/null +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json @@ -0,0 +1,38 @@ +{ + "database": { + "host": "localhost2", + "username": "postgres", + "password": "postgres", + "db_name": "ann", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison-run-seqon", + "drop_old": true, + "load": true, + "search-serial": false, + "search-concurrent": false, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "20GB", + "max-parallel-workers": 7, + "ef-search": [40], + 
"ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-1_5m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_1500k", + "custom-dataset-size": 1500000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 3, + "custom-dataset-use-shuffled": false, + "run_count": 1 + } + ] + } + \ No newline at end of file diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json new file mode 100644 index 000000000..147101672 --- /dev/null +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json @@ -0,0 +1,38 @@ +{ + "database": { + "host": "localhost1", + "username": "postgres", + "password": "postgres", + "db_name": "ann3", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison-run-seqon", + "drop_old": true, + "load": true, + "search-serial": false, + "search-concurrent": false, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "20GB", + "max-parallel-workers": 7, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-2m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_2000k", + "custom-dataset-size": 2000000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 4, + "custom-dataset-use-shuffled": false, + "run_count": 1 + } + ] + } + \ No newline at end of file diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json new file mode 100644 index 000000000..e950b60e5 --- /dev/null +++ 
b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json @@ -0,0 +1,38 @@ +{ + "database": { + "host": "localhost2", + "username": "postgres", + "password": "postgres", + "db_name": "ann2", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison-run-seqon", + "drop_old": true, + "load": true, + "search-serial": false, + "search-concurrent": false, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "20GB", + "max-parallel-workers": 7, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-2_5m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_2500k", + "custom-dataset-size": 2500000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 5, + "custom-dataset-use-shuffled": false, + "run_count": 1 + } + ] + } + \ No newline at end of file diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json new file mode 100644 index 000000000..d2a489f3b --- /dev/null +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json @@ -0,0 +1,38 @@ +{ + "database": { + "host": "localhost1", + "username": "postgres", + "password": "postgres", + "db_name": "ann", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison-run-seqon", + "drop_old": true, + "load": true, + "search-serial": false, + "search-concurrent": false, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "20GB", + "max-parallel-workers": 7, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": 
"hnsw-1536D-500K", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_500k", + "custom-dataset-size": 500000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 1, + "custom-dataset-use-shuffled": false, + "run_count": 1 + } + ] + } + \ No newline at end of file diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json new file mode 100644 index 000000000..352361e4f --- /dev/null +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json @@ -0,0 +1,38 @@ +{ + "database": { + "host": "localhost1", + "username": "postgres", + "password": "postgres", + "db_name": "ann2", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison-run-seqon", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "8GB", + "max-parallel-workers": 3, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-1m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_1000k", + "custom-dataset-size": 1000000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 2, + "custom-dataset-use-shuffled": false, + "run_count": 1 + } + ] + } + \ No newline at end of file diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json new file mode 100644 index 000000000..a85fec9d8 --- /dev/null +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json @@ -0,0 +1,38 @@ +{ + "database": { + "host": "localhost2", + "username": "postgres", + "password": "postgres", + "db_name": "ann", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": 
"on" + }, + "cases": [ + { + "db-label": "memory-comparison-run-seqon", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "8GB", + "max-parallel-workers": 3, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-1_5m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_1500k", + "custom-dataset-size": 1500000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 3, + "custom-dataset-use-shuffled": false, + "run_count": 1 + } + ] + } + \ No newline at end of file diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json new file mode 100644 index 000000000..3fa809c27 --- /dev/null +++ b/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json @@ -0,0 +1,38 @@ +{ + "database": { + "host": "localhost1", + "username": "postgres", + "password": "postgres", + "db_name": "ann3", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison-run-seqon", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "8GB", + "max-parallel-workers": 3, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-2m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_2000k", + "custom-dataset-size": 2000000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 4, + "custom-dataset-use-shuffled": false, + "run_count": 1 + } + ] + } + \ No newline at end of file diff --git 
a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json new file mode 100644 index 000000000..814ceb598 --- /dev/null +++ b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json @@ -0,0 +1,38 @@ +{ + "database": { + "host": "localhost2", + "username": "postgres", + "password": "postgres", + "db_name": "ann2", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison-run-seqon", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "8GB", + "max-parallel-workers": 3, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-2_5m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_2500k", + "custom-dataset-size": 2500000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 5, + "custom-dataset-use-shuffled": false, + "run_count": 1 + } + ] + } + \ No newline at end of file diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-500k.json new file mode 100644 index 000000000..39cfb13e6 --- /dev/null +++ b/custom-run-configs/config-custom-dataset-small-hnsw-500k.json @@ -0,0 +1,38 @@ +{ + "database": { + "host": "localhost1", + "username": "postgres", + "password": "postgres", + "db_name": "ann", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison-run-seqon", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "8GB", + "max-parallel-workers": 3, + "ef-search": [40], + 
"ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-500K", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_500k", + "custom-dataset-size": 500000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 1, + "custom-dataset-use-shuffled": false, + "run_count": 1 + } + ] + } + \ No newline at end of file From 4eb23bb52f5f15f00ba7a520cda0a0543f0aea34 Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Mon, 14 Oct 2024 18:58:47 +0500 Subject: [PATCH 02/20] Update run script to handle mulitiple configs in a directory. --- run-custom-dataset.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/run-custom-dataset.py b/run-custom-dataset.py index a5d1213d5..b3f71ae29 100644 --- a/run-custom-dataset.py +++ b/run-custom-dataset.py @@ -1,3 +1,4 @@ +import argparse import json import time from contextlib import redirect_stdout @@ -246,16 +247,23 @@ def run_benchmark(case, db_config): time.sleep(60) def main(): - config = load_config("config.json") - start_time = time.time() - for case in config['cases']: - print(f"Running case: {case['db-label']}") - setup_database(config) - - run_benchmark(case, config['database']) - end_time = time.time() - execution_time = end_time - start_time - print(f"COMPLETED ALL EXECUTIONS. 
total_duration={execution_time}") + parser = argparse.ArgumentParser(description="Run benchmarks on a custom dataset.") + parser.add_argument("--config-dir-path", type=str, help="Path to the config files directory.") + args = parser.parse_args() + + for dir_path, _, file_names in os.walk(args.config_dir_path): + for file_name in file_names: + config = load_config(os.path.join(dir_path, file_name)) + start_time = time.time() + for case in config['cases']: + print(f"Running case: {case['db-label']}") + setup_database(config) + + run_benchmark(case, config['database']) + teardown_database(config) + end_time = time.time() + execution_time = end_time - start_time + print(f"COMPLETED ALL EXECUTIONS of config {file_name}. total_duration={execution_time}") if __name__ == "__main__": main() From f662dd142e3c7446f29bdf74fa624d3b58944b01 Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Tue, 15 Oct 2024 17:37:55 +0500 Subject: [PATCH 03/20] resolved comments --- .../config-custom-dataset-small-hnsw-1000k.json | 8 ++++---- .../config-custom-dataset-small-hnsw-1500k.json | 8 ++++---- .../config-custom-dataset-small-hnsw-2000k.json | 8 ++++---- .../config-custom-dataset-small-hnsw-2500k.json | 8 ++++---- .../config-custom-dataset-small-hnsw-500k.json | 8 ++++---- .../config-custom-dataset-small-hnsw-1000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-1500k.json | 4 ++-- .../config-custom-dataset-small-hnsw-2000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-2500k.json | 4 ++-- .../config-custom-dataset-small-hnsw-500k.json | 4 ++-- 10 files changed, 30 insertions(+), 30 deletions(-) diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json index 6a468866b..12a041ab6 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json @@ -3,21 +3,21 @@ "host": 
"localhost1", "username": "postgres", "password": "postgres", - "db_name": "ann2", + "db_name": "ann-1000k", "instance_type": "db.m6i.large", "provider": "aws", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-run-seqon", + "db-label": "memory-comparison", "drop_old": true, "load": true, "search-serial": false, "search-concurrent": false, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "20GB", - "max-parallel-workers": 7, + "maintenance-work-mem": "42GB", + "max-parallel-workers": 15, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json index 4194ad0e4..f978cdca5 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json @@ -3,21 +3,21 @@ "host": "localhost2", "username": "postgres", "password": "postgres", - "db_name": "ann", + "db_name": "ann-1500k", "instance_type": "db.m6i.large", "provider": "aws", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-run-seqon", + "db-label": "memory-comparison", "drop_old": true, "load": true, "search-serial": false, "search-concurrent": false, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "20GB", - "max-parallel-workers": 7, + "maintenance-work-mem": "42GB", + "max-parallel-workers": 15, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json index 147101672..3b8068674 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json @@ -3,21 +3,21 @@ "host": "localhost1", "username": "postgres", "password": 
"postgres", - "db_name": "ann3", + "db_name": "ann-2000k", "instance_type": "db.m6i.large", "provider": "aws", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-run-seqon", + "db-label": "memory-comparison", "drop_old": true, "load": true, "search-serial": false, "search-concurrent": false, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "20GB", - "max-parallel-workers": 7, + "maintenance-work-mem": "42GB", + "max-parallel-workers": 15, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json index e950b60e5..5612e29db 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json @@ -3,21 +3,21 @@ "host": "localhost2", "username": "postgres", "password": "postgres", - "db_name": "ann2", + "db_name": "ann-2500k", "instance_type": "db.m6i.large", "provider": "aws", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-run-seqon", + "db-label": "memory-comparison", "drop_old": true, "load": true, "search-serial": false, "search-concurrent": false, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "20GB", - "max-parallel-workers": 7, + "maintenance-work-mem": "42GB", + "max-parallel-workers": 15, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json index d2a489f3b..dc86e930a 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json @@ -3,21 +3,21 @@ "host": "localhost1", "username": "postgres", "password": "postgres", - "db_name": "ann", + "db_name": "ann-500k", 
"instance_type": "db.m6i.large", "provider": "aws", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-run-seqon", + "db-label": "memory-comparison", "drop_old": true, "load": true, "search-serial": false, "search-concurrent": false, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "20GB", - "max-parallel-workers": 7, + "maintenance-work-mem": "42GB", + "max-parallel-workers": 15, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json index 352361e4f..bed3ebad1 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json @@ -3,14 +3,14 @@ "host": "localhost1", "username": "postgres", "password": "postgres", - "db_name": "ann2", + "db_name": "ann-1000k", "instance_type": "db.m6i.large", "provider": "aws", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-run-seqon", + "db-label": "memory-comparison", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json index a85fec9d8..41fd90c94 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json @@ -3,14 +3,14 @@ "host": "localhost2", "username": "postgres", "password": "postgres", - "db_name": "ann", + "db_name": "ann-1500k", "instance_type": "db.m6i.large", "provider": "aws", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-run-seqon", + "db-label": "memory-comparison", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json index 3fa809c27..f488c736a 
100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json @@ -3,14 +3,14 @@ "host": "localhost1", "username": "postgres", "password": "postgres", - "db_name": "ann3", + "db_name": "ann-2000k", "instance_type": "db.m6i.large", "provider": "aws", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-run-seqon", + "db-label": "memory-comparison", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json index 814ceb598..7111acc5f 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json @@ -3,14 +3,14 @@ "host": "localhost2", "username": "postgres", "password": "postgres", - "db_name": "ann2", + "db_name": "ann-2500k", "instance_type": "db.m6i.large", "provider": "aws", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-run-seqon", + "db-label": "memory-comparison", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-500k.json index 39cfb13e6..0867ef241 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-500k.json @@ -3,14 +3,14 @@ "host": "localhost1", "username": "postgres", "password": "postgres", - "db_name": "ann", + "db_name": "ann-500k", "instance_type": "db.m6i.large", "provider": "aws", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-run-seqon", + "db-label": "memory-comparison", "drop_old": false, "load": false, "search-serial": true, From 10e1d7495bfbbf788444e8fdc2e4017f4ce45e3f Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Tue, 15 Oct 2024 17:38:49 +0500 Subject: 
[PATCH 04/20] Updated script to only generate dataset of specified file_count. --- create_dataset_subsets.py | 66 +++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 20 deletions(-) diff --git a/create_dataset_subsets.py b/create_dataset_subsets.py index efb1d3c82..c1263cabf 100644 --- a/create_dataset_subsets.py +++ b/create_dataset_subsets.py @@ -1,7 +1,11 @@ import os import shutil import argparse +import logging +# Set up logging +logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) def get_file_name(file_name: str, file_prefix: str, files_count: int) -> str: if files_count == 1: @@ -9,28 +13,47 @@ def get_file_name(file_name: str, file_prefix: str, files_count: int) -> str: file_name = file_name.split("of-")[0] return file_name + "of-" + str(files_count).zfill(2) + ".parquet" -def create_subsets(base_dir, save_dir_path, subset_prefix, file_prefix, step_size): - files = sorted([f for f in os.listdir(base_dir) if f.startswith(file_prefix)]) +def create_dataset(base_dir, save_dir_path, subset_prefix, file_prefix, file_count, row_count=500_00): + logger.info(f"Starting dataset creation with {file_count} files.") + + # Sort the files and pick only the first 'file_count' files + files = sorted([f for f in os.listdir(base_dir) if f.startswith(file_prefix)])[:file_count] num_files = len(files) + + if num_files == 0: + logger.warning("No files found with the specified prefix.") + return + + logger.info(f"Found {num_files} files. 
Creating dataset...") + + # Create the directory for the dataset + subset_dir = os.path.join(save_dir_path, f"{subset_prefix}_{file_count * row_count // 1000}k") + os.makedirs(subset_dir, exist_ok=True) + logger.info(f"Created directory for the dataset: {subset_dir}") + + # Copy the first 'file_count' files into the subset directory + for file in files: + src_file = os.path.join(base_dir, file) + dst_file = os.path.join(subset_dir, get_file_name(file, file_prefix, file_count)) + shutil.copy(src_file, dst_file) + logger.info(f"Copied {file} to {dst_file}") - for i in range(1, num_files + 1): - subset_dir = os.path.join(save_dir_path, f"{subset_prefix}_{i * step_size // 1000}k") - os.makedirs(subset_dir, exist_ok=True) - - for j in range(i): - src_file = os.path.join(base_dir, files[j]) - dst_file = os.path.join(subset_dir, get_file_name(files[j], file_prefix, i)) - shutil.copy(src_file, dst_file) - src_test_file = os.path.join(base_dir, "test.parquet") - dst_test_file = os.path.join(subset_dir, "test.parquet") - shutil.copy(src_test_file, dst_test_file) + # Also copy the test.parquet file + src_test_file = os.path.join(base_dir, "test.parquet") + dst_test_file = os.path.join(subset_dir, "test.parquet") + shutil.copy(src_test_file, dst_test_file) + logger.info(f"Copied test.parquet to {subset_dir}") + + logger.info(f"Dataset creation completed. 
{file_count} files have been copied to {subset_dir}.") if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Create subsets of Parquet files using Dask.") - parser.add_argument("--directory", type=str, help="Path to the directory containing Parquet files.") - parser.add_argument("--save-dir-path", type=str, help="Directory path where data will be saved") - parser.add_argument("--dataset-name-prefix", type=str, help="Name prefix of the folder where each subset will be saved.") + parser = argparse.ArgumentParser(description="Create a dataset with a specified number of Parquet files.") + parser.add_argument("--directory", type=str, required=True, help="Path to the directory containing Parquet files.") + parser.add_argument("--save-dir-path", type=str, required=True, help="Directory path where the dataset will be saved.") + parser.add_argument("--dataset-name-prefix", type=str, help="Name prefix for the dataset folder.") parser.add_argument("--is-shuffled", type=bool, help="Whether the files are shuffled or not.") + parser.add_argument("--file-count", type=int, required=True, help="Number of Parquet files to include in the dataset.") + args = parser.parse_args() file_prefix = ( @@ -48,7 +71,10 @@ def create_subsets(base_dir, save_dir_path, subset_prefix, file_prefix, step_siz if args.save_dir_path else args.directory ) - step_size = 500_000 # 500k - create_subsets(args.directory, save_dir_path, subset_prefix, file_prefix, step_size) - print(f'Finished creating subsets of Parquet files in {args.directory}.') \ No newline at end of file + # Log the input parameters + logger.info(f"Parameters received: directory={args.directory}, save_dir_path={args.save_dir_path}, file_count={args.file_count}, dataset_name_prefix={subset_prefix}, is_shuffled={args.is_shuffled}") + + # Create the dataset with the specified file_count + create_dataset(args.directory, save_dir_path, subset_prefix, file_prefix, args.file_count) + logger.info(f'Finished creating a dataset 
with {args.file_count} Parquet files.') \ No newline at end of file From a5cb055db2ef05674e7e99f6f3d5fb01e8eeac1d Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Tue, 15 Oct 2024 17:41:46 +0500 Subject: [PATCH 05/20] Added config files of large datasets --- ...onfig-custom-dataset-small-hnsw-3000k.json | 38 +++++++++++++++++++ ...onfig-custom-dataset-small-hnsw-3500k.json | 38 +++++++++++++++++++ ...onfig-custom-dataset-small-hnsw-4000k.json | 38 +++++++++++++++++++ ...onfig-custom-dataset-small-hnsw-4500k.json | 38 +++++++++++++++++++ ...onfig-custom-dataset-small-hnsw-5000k.json | 38 +++++++++++++++++++ 5 files changed, 190 insertions(+) create mode 100644 custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json create mode 100644 custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json create mode 100644 custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json create mode 100644 custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json create mode 100644 custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json new file mode 100644 index 000000000..681462f70 --- /dev/null +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json @@ -0,0 +1,38 @@ +{ + "database": { + "host": "localhost1", + "username": "postgres", + "password": "postgres", + "db_name": "ann-3000k", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison", + "drop_old": true, + "load": true, + "search-serial": false, + "search-concurrent": false, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "42GB", + "max-parallel-workers": 15, + "ef-search": [40], + "ef-construction": 
128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-3m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_3000k", + "custom-dataset-size": 3000000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 6, + "custom-dataset-use-shuffled": false, + "run_count": 1 + } + ] + } + \ No newline at end of file diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json new file mode 100644 index 000000000..5da2a8fa7 --- /dev/null +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json @@ -0,0 +1,38 @@ +{ + "database": { + "host": "localhost2", + "username": "postgres", + "password": "postgres", + "db_name": "ann-3500k", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison", + "drop_old": true, + "load": true, + "search-serial": false, + "search-concurrent": false, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "42GB", + "max-parallel-workers": 15, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-3_5m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_3500k", + "custom-dataset-size": 3500000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 7, + "custom-dataset-use-shuffled": false, + "run_count": 1 + } + ] + } + \ No newline at end of file diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json new file mode 100644 index 000000000..8ff014218 --- /dev/null +++ 
b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json @@ -0,0 +1,38 @@ +{ + "database": { + "host": "localhost1", + "username": "postgres", + "password": "postgres", + "db_name": "ann-4000k", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison", + "drop_old": true, + "load": true, + "search-serial": false, + "search-concurrent": false, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "42GB", + "max-parallel-workers": 15, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-4m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_4000k", + "custom-dataset-size": 4000000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 8, + "custom-dataset-use-shuffled": false, + "run_count": 1 + } + ] + } + \ No newline at end of file diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json new file mode 100644 index 000000000..f57ae2c9e --- /dev/null +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json @@ -0,0 +1,38 @@ +{ + "database": { + "host": "localhost2", + "username": "postgres", + "password": "postgres", + "db_name": "ann-4500k", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison", + "drop_old": true, + "load": true, + "search-serial": false, + "search-concurrent": false, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "42GB", + "max-parallel-workers": 15, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + 
"custom-case-name": "hnsw-1536D-4_5m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_4500k", + "custom-dataset-size": 4500000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 9, + "custom-dataset-use-shuffled": false, + "run_count": 1 + } + ] + } + \ No newline at end of file diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json new file mode 100644 index 000000000..a9b3e20c3 --- /dev/null +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json @@ -0,0 +1,38 @@ +{ + "database": { + "host": "localhost1", + "username": "postgres", + "password": "postgres", + "db_name": "ann-5000k", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison", + "drop_old": true, + "load": true, + "search-serial": false, + "search-concurrent": false, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "42GB", + "max-parallel-workers": 15, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-5m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_5000k", + "custom-dataset-size": 5000000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 10, + "custom-dataset-use-shuffled": false, + "run_count": 1 + } + ] + } + \ No newline at end of file From b605c835808a6c27e877bb6620cc92122730a5fa Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Tue, 15 Oct 2024 19:19:08 +0500 Subject: [PATCH 06/20] Updated scripts and configs to create dataset on runtime. 
--- create_dataset_subsets.py | 7 +- ...onfig-custom-dataset-small-hnsw-3000k.json | 5 ++ ...onfig-custom-dataset-small-hnsw-3500k.json | 5 ++ ...onfig-custom-dataset-small-hnsw-4000k.json | 5 ++ ...onfig-custom-dataset-small-hnsw-4500k.json | 5 ++ ...onfig-custom-dataset-small-hnsw-5000k.json | 5 ++ ...onfig-custom-dataset-small-hnsw-1000k.json | 5 ++ ...onfig-custom-dataset-small-hnsw-1500k.json | 5 ++ ...onfig-custom-dataset-small-hnsw-2000k.json | 5 ++ ...onfig-custom-dataset-small-hnsw-2500k.json | 5 ++ ...config-custom-dataset-small-hnsw-500k.json | 5 ++ ...onfig-custom-dataset-small-hnsw-1000k.json | 5 ++ ...onfig-custom-dataset-small-hnsw-1500k.json | 5 ++ ...onfig-custom-dataset-small-hnsw-2000k.json | 5 ++ ...onfig-custom-dataset-small-hnsw-2500k.json | 5 ++ ...config-custom-dataset-small-hnsw-500k.json | 5 ++ run-custom-dataset.py | 77 +++++++++++++++++++ 17 files changed, 157 insertions(+), 2 deletions(-) diff --git a/create_dataset_subsets.py b/create_dataset_subsets.py index c1263cabf..21b131a84 100644 --- a/create_dataset_subsets.py +++ b/create_dataset_subsets.py @@ -13,7 +13,7 @@ def get_file_name(file_name: str, file_prefix: str, files_count: int) -> str: file_name = file_name.split("of-")[0] return file_name + "of-" + str(files_count).zfill(2) + ".parquet" -def create_dataset(base_dir, save_dir_path, subset_prefix, file_prefix, file_count, row_count=500_00): +def create_dataset(base_dir, save_dir_path, subset_prefix, file_prefix, file_count, row_count=500_000): logger.info(f"Starting dataset creation with {file_count} files.") # Sort the files and pick only the first 'file_count' files @@ -53,7 +53,6 @@ def create_dataset(base_dir, save_dir_path, subset_prefix, file_prefix, file_cou parser.add_argument("--dataset-name-prefix", type=str, help="Name prefix for the dataset folder.") parser.add_argument("--is-shuffled", type=bool, help="Whether the files are shuffled or not.") parser.add_argument("--file-count", type=int, required=True, help="Number 
of Parquet files to include in the dataset.") - args = parser.parse_args() file_prefix = ( @@ -72,6 +71,10 @@ def create_dataset(base_dir, save_dir_path, subset_prefix, file_prefix, file_cou else args.directory ) + if os.path.exists(save_dir_path) and os.listdir(save_dir_path): + shutil.rmtree(save_dir_path) + logger.info(f"Deleted existing directory: {save_dir_path}") + # Log the input parameters logger.info(f"Parameters received: directory={args.directory}, save_dir_path={args.save_dir_path}, file_count={args.file_count}, dataset_name_prefix={subset_prefix}, is_shuffled={args.is_shuffled}") diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json index 681462f70..e82e520fc 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json @@ -31,6 +31,11 @@ "custom-dataset-dim": 1536, "custom-dataset-file-count": 6, "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, "run_count": 1 } ] diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json index 5da2a8fa7..b56c50230 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json @@ -31,6 +31,11 @@ "custom-dataset-dim": 1536, "custom-dataset-file-count": 7, "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": 
"/home/azureuser/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, "run_count": 1 } ] diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json index 8ff014218..9d4d7e671 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json @@ -31,6 +31,11 @@ "custom-dataset-dim": 1536, "custom-dataset-file-count": 8, "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, "run_count": 1 } ] diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json index f57ae2c9e..331d818cd 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json @@ -31,6 +31,11 @@ "custom-dataset-dim": 1536, "custom-dataset-file-count": 9, "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, "run_count": 1 } ] diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json index a9b3e20c3..9dc745bc5 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json @@ -31,6 +31,11 @@ "custom-dataset-dim": 
1536, "custom-dataset-file-count": 10, "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, "run_count": 1 } ] diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json index 12a041ab6..ccda5c237 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json @@ -31,6 +31,11 @@ "custom-dataset-dim": 1536, "custom-dataset-file-count": 2, "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, "run_count": 1 } ] diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json index f978cdca5..381d5c982 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json @@ -31,6 +31,11 @@ "custom-dataset-dim": 1536, "custom-dataset-file-count": 3, "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, "run_count": 1 } ] diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json index 3b8068674..ada201112 100644 --- 
a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json @@ -31,6 +31,11 @@ "custom-dataset-dim": 1536, "custom-dataset-file-count": 4, "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, "run_count": 1 } ] diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json index 5612e29db..1d1f669b5 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json @@ -31,6 +31,11 @@ "custom-dataset-dim": 1536, "custom-dataset-file-count": 5, "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, "run_count": 1 } ] diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json index dc86e930a..74514cf2a 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json @@ -31,6 +31,11 @@ "custom-dataset-dim": 1536, "custom-dataset-file-count": 1, "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, "run_count": 1 } ] diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json 
b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json index bed3ebad1..f1cc2534e 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json @@ -31,6 +31,11 @@ "custom-dataset-dim": 1536, "custom-dataset-file-count": 2, "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, "run_count": 1 } ] diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json index 41fd90c94..93aa2d4db 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json @@ -31,6 +31,11 @@ "custom-dataset-dim": 1536, "custom-dataset-file-count": 3, "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, "run_count": 1 } ] diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json index f488c736a..31d62fec7 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json @@ -31,6 +31,11 @@ "custom-dataset-dim": 1536, "custom-dataset-file-count": 4, "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, "run_count": 1 } ] diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json 
b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json index 7111acc5f..08b380cc0 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json @@ -31,6 +31,11 @@ "custom-dataset-dim": 1536, "custom-dataset-file-count": 5, "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, "run_count": 1 } ] diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-500k.json index 0867ef241..8e173c630 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-500k.json @@ -31,6 +31,11 @@ "custom-dataset-dim": 1536, "custom-dataset-file-count": 1, "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, "run_count": 1 } ] diff --git a/run-custom-dataset.py b/run-custom-dataset.py index b3f71ae29..bde14a6be 100644 --- a/run-custom-dataset.py +++ b/run-custom-dataset.py @@ -7,6 +7,7 @@ import psycopg from psycopg import sql import os +import shutil os.environ["LOG_LEVEL"] = "DEBUG" @@ -41,11 +42,62 @@ def setup_database(config): cursor = conn.cursor() cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;") cursor.execute("CREATE EXTENSION IF NOT EXISTS pg_buffercache;") + cursor.execute("CREATE EXTENSION IF NOT EXISTS pg_prewarm;") conn.commit() conn.close() except Exception as e: print(f"Setup failed: {e}") + +def create_dataset(args: dict) -> bool: + """ + This function creates a dataset from the original dataset using script + create_dataset_subsets.py and pass 
required arguments to it. + """ + file_count = args.get("file-count") + is_shuffled = args.get("is-shuffled") + directory = args.get("directory") + output_dir = args.get("save-dir-path") + + try: + # Define the command to run the create_dataset_subsets.py script + command = [ + "python3", "create_dataset_subsets.py", + "--directory", directory, + "--save-dir-path", output_dir, + "--file-count", str(file_count), + ] + print(f"Running command: {' '.join(command)}") + + file_prefix = "train" + if is_shuffled: + file_prefix = "shuffle_train" + command += ["--is-shuffled", "True"] + subprocess.run(command, check=True) + print("Check if dataset was created successfully.") + + created_files_count = sum([1 for _, _, files in os.walk(output_dir) for f in files if f.startswith(file_prefix)]) + print(f"Number of files in the output dataset directory: {created_files_count}") + + if created_files_count != file_count: + raise Exception("Incorrect number of files.") + print("Dataset creation successful.") + except (subprocess.CalledProcessError, Exception) as e: + print(f"Dataset creation failed: {e}") + return False + + return True + +def delete_dataset(dataset_dir: str): + try: + if os.path.exists(dataset_dir): + shutil.rmtree(dataset_dir) + print(f"Deleted directory: {dataset_dir}") + else: + print(f"Directory does not exist: {dataset_dir}") + except Exception as e: + print(f"Failed to delete directory: {e}") + def teardown_database(config): # Optionally drop the database after the test pass @@ -133,6 +185,21 @@ def query_configurations(config): print(f"Failed to query configurations: {e}") return {} +def pre_warm(config): + print("Running pre warm") + try: + conn = psycopg.connect( + dbname=config['db_name'], + user=config['username'], + password=config['password'], + host=config['host'], + ) + cursor = conn.cursor() + cursor.execute("SELECT pg_prewarm('public.pgvector_index')"); + conn.close() + print("Pre-warm completed") + except Exception: + pass def run_benchmark(case, 
db_config): base_command = [ @@ -226,6 +293,8 @@ def run_benchmark(case, db_config): print(f"{key}: {value}") get_stats(db_config) f.flush() + pre_warm(db_config) + print(f"Running with prewarm") print(f"Running command: {' '.join(command)}") f.flush() @@ -258,9 +327,17 @@ def main(): for case in config['cases']: print(f"Running case: {case['db-label']}") setup_database(config) + + create_dataset_args = case['create-dataset-args'] + create_dataset_args["file-count"] = case["custom-dataset-file-count"] + dataset_created = create_dataset(create_dataset_args) + if not dataset_created: + print(f"Failed to create dataset for case: {case['custom-case-name']} -- Skipping execution.") + continue run_benchmark(case, config['database']) teardown_database(config) + delete_dataset(create_dataset_args["save-dir-path"]) end_time = time.time() execution_time = end_time - start_time print(f"COMPLETED ALL EXECUTIONS of config {file_name}. total_duration={execution_time}") From 12a439a7167439810d3266309c8b278a52e1be70 Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Tue, 15 Oct 2024 19:24:30 +0500 Subject: [PATCH 07/20] Updated dataset directory paths --- .../config-custom-dataset-small-hnsw-3000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-3500k.json | 4 ++-- .../config-custom-dataset-small-hnsw-4000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-4500k.json | 4 ++-- .../config-custom-dataset-small-hnsw-5000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-1000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-1500k.json | 4 ++-- .../config-custom-dataset-small-hnsw-2000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-2500k.json | 4 ++-- .../config-custom-dataset-small-hnsw-500k.json | 4 ++-- .../config-custom-dataset-small-hnsw-1000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-1500k.json | 4 ++-- .../config-custom-dataset-small-hnsw-2000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-2500k.json | 4 ++-- 
custom-run-configs/config-custom-dataset-small-hnsw-500k.json | 4 ++-- 15 files changed, 30 insertions(+), 30 deletions(-) diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json index e82e520fc..2a822902d 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 6, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json index b56c50230..a1cc4ec50 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 7, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json index 
9d4d7e671..26615954e 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 8, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json index 331d818cd..f666ed424 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 9, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json index 9dc745bc5..0fe1c9a9d 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 10, "custom-dataset-use-shuffled": false, 
"create-dataset-args": { - "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json index ccda5c237..f6b49680e 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 2, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json index 381d5c982..af3ea6175 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 3, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": 
false }, "run_count": 1 diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json index ada201112..e80b1b20e 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 4, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json index 1d1f669b5..0a572765d 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 5, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json index 74514cf2a..e9a61787e 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json +++ 
b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 1, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json index f1cc2534e..06d81d4f5 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 2, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json index 93aa2d4db..c2bd493d5 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 3, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": 
"/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json index 31d62fec7..4611c6062 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 4, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json index 08b380cc0..776cb9d8c 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 5, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-500k.json index 8e173c630..f204bf56b 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-500k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 
1, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/azureuser/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/azureuser/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, "run_count": 1 From d990ac5f8405e544daaee3d189d46582952613d1 Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Tue, 15 Oct 2024 19:58:42 +0500 Subject: [PATCH 08/20] Added run configs upto 5 million custom dataset --- ...onfig-custom-dataset-small-hnsw-3000k.json | 43 +++++++++++++++++++ ...onfig-custom-dataset-small-hnsw-3500k.json | 43 +++++++++++++++++++ ...onfig-custom-dataset-small-hnsw-4000k.json | 43 +++++++++++++++++++ ...onfig-custom-dataset-small-hnsw-4500k.json | 43 +++++++++++++++++++ ...onfig-custom-dataset-small-hnsw-5000k.json | 43 +++++++++++++++++++ 5 files changed, 215 insertions(+) create mode 100644 custom-run-configs/config-custom-dataset-small-hnsw-3000k.json create mode 100644 custom-run-configs/config-custom-dataset-small-hnsw-3500k.json create mode 100644 custom-run-configs/config-custom-dataset-small-hnsw-4000k.json create mode 100644 custom-run-configs/config-custom-dataset-small-hnsw-4500k.json create mode 100644 custom-run-configs/config-custom-dataset-small-hnsw-5000k.json diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-3000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-3000k.json new file mode 100644 index 000000000..0be352d78 --- /dev/null +++ b/custom-run-configs/config-custom-dataset-small-hnsw-3000k.json @@ -0,0 +1,43 @@ +{ + "database": { + "host": "localhost1", + "username": "postgres", + "password": "postgres", + "db_name": "ann-3000k", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison", + "drop_old": false, + "load": 
false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "8GB", + "max-parallel-workers": 3, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-3m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_3000k", + "custom-dataset-size": 3000000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 6, + "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, + "run_count": 1 + } + ] + } + \ No newline at end of file diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json new file mode 100644 index 000000000..d87c5fbb5 --- /dev/null +++ b/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json @@ -0,0 +1,43 @@ +{ + "database": { + "host": "localhost2", + "username": "postgres", + "password": "postgres", + "db_name": "ann-3500k", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "8GB", + "max-parallel-workers": 3, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-3_5m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_3500k", + "custom-dataset-size": 3500000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 7, + 
"custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, + "run_count": 1 + } + ] + } + \ No newline at end of file diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json new file mode 100644 index 000000000..e0cbfd5ce --- /dev/null +++ b/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json @@ -0,0 +1,43 @@ +{ + "database": { + "host": "localhost1", + "username": "postgres", + "password": "postgres", + "db_name": "ann-4000k", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "8GB", + "max-parallel-workers": 3, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-4m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_4000k", + "custom-dataset-size": 4000000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 8, + "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, + "run_count": 1 + } + ] + } + \ No newline at end of file diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-4500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-4500k.json new file mode 100644 index 000000000..c7110eb85 --- /dev/null +++ 
b/custom-run-configs/config-custom-dataset-small-hnsw-4500k.json @@ -0,0 +1,43 @@ +{ + "database": { + "host": "localhost2", + "username": "postgres", + "password": "postgres", + "db_name": "ann-4500k", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "8GB", + "max-parallel-workers": 3, + "ef-search": [40], + "ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-4_5m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_4500k", + "custom-dataset-size": 4500000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 9, + "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, + "run_count": 1 + } + ] + } + \ No newline at end of file diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-5000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-5000k.json new file mode 100644 index 000000000..1d9124996 --- /dev/null +++ b/custom-run-configs/config-custom-dataset-small-hnsw-5000k.json @@ -0,0 +1,43 @@ +{ + "database": { + "host": "localhost1", + "username": "postgres", + "password": "postgres", + "db_name": "ann-5000k", + "instance_type": "db.m6i.large", + "provider": "aws", + "enable_seqscan": "on" + }, + "cases": [ + { + "db-label": "memory-comparison", + "drop_old": false, + "load": false, + "search-serial": true, + "search-concurrent": true, + "case-type": "PerformanceCustomDataset", + "maintenance-work-mem": "8GB", + "max-parallel-workers": 3, + "ef-search": [40], + 
"ef-construction": 128, + "m": 32, + "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100", + "concurrency-duration": 30, + "k": 10, + "custom-case-name": "hnsw-1536D-5m", + "custom-dataset-name": "custom-openai", + "custom-dataset-dir": "openai_5000k", + "custom-dataset-size": 5000000, + "custom-dataset-dim": 1536, + "custom-dataset-file-count": 10, + "custom-dataset-use-shuffled": false, + "create-dataset-args": { + "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "is-shuffled": false + }, + "run_count": 1 + } + ] + } + \ No newline at end of file From b6dd4d5159941bfe8ed37e689d5ee992c7ca58f6 Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Tue, 15 Oct 2024 20:00:49 +0500 Subject: [PATCH 09/20] Increased run count to 3 for search configs --- custom-run-configs/config-custom-dataset-small-hnsw-1000k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-1500k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-2000k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-2500k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-3000k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-3500k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-4000k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-4500k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-5000k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-500k.json | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json index 06d81d4f5..633aad799 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json @@ -36,7 +36,7 @@ "save-dir-path": 
"/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, - "run_count": 1 + "run_count": 3 } ] } diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json index c2bd493d5..d84947c21 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json @@ -36,7 +36,7 @@ "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, - "run_count": 1 + "run_count": 3 } ] } diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json index 4611c6062..e04da17b4 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json @@ -36,7 +36,7 @@ "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, - "run_count": 1 + "run_count": 3 } ] } diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json index 776cb9d8c..752d71d60 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json @@ -36,7 +36,7 @@ "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, - "run_count": 1 + "run_count": 3 } ] } diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-3000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-3000k.json index 0be352d78..2c8f015e3 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-3000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-3000k.json @@ -36,7 +36,7 @@ "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, - "run_count": 1 + "run_count": 3 } ] } diff --git 
a/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json index d87c5fbb5..dfb850610 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json @@ -36,7 +36,7 @@ "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, - "run_count": 1 + "run_count": 3 } ] } diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json index e0cbfd5ce..68074b2f4 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json @@ -36,7 +36,7 @@ "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, - "run_count": 1 + "run_count": 3 } ] } diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-4500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-4500k.json index c7110eb85..24d830036 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-4500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-4500k.json @@ -36,7 +36,7 @@ "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, - "run_count": 1 + "run_count": 3 } ] } diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-5000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-5000k.json index 1d9124996..63371ff68 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-5000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-5000k.json @@ -36,7 +36,7 @@ "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, - "run_count": 1 + "run_count": 3 } ] } diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-500k.json 
index f204bf56b..81d89826d 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-500k.json @@ -36,7 +36,7 @@ "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", "is-shuffled": false }, - "run_count": 1 + "run_count": 3 } ] } From 17b34e058ca55bd74e52fb8d4d00f189382a4426 Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Tue, 15 Oct 2024 20:07:42 +0500 Subject: [PATCH 10/20] Updated db labels in configs. --- .../config-custom-dataset-small-hnsw-3000k.json | 2 +- .../config-custom-dataset-small-hnsw-3500k.json | 2 +- .../config-custom-dataset-small-hnsw-4000k.json | 2 +- .../config-custom-dataset-small-hnsw-4500k.json | 2 +- .../config-custom-dataset-small-hnsw-5000k.json | 2 +- .../config-custom-dataset-small-hnsw-1000k.json | 2 +- .../config-custom-dataset-small-hnsw-1500k.json | 2 +- .../config-custom-dataset-small-hnsw-2000k.json | 2 +- .../config-custom-dataset-small-hnsw-2500k.json | 2 +- .../config-custom-dataset-small-hnsw-500k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-1000k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-1500k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-2000k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-2500k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-3000k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-3500k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-4000k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-5000k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-500k.json | 2 +- 19 files changed, 19 insertions(+), 19 deletions(-) diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json index 2a822902d..477f395f4 100644 --- 
a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-3000k", "drop_old": true, "load": true, "search-serial": false, diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json index a1cc4ec50..2b3284e4c 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-3500k", "drop_old": true, "load": true, "search-serial": false, diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json index 26615954e..25979e397 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-4000k", "drop_old": true, "load": true, "search-serial": false, diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json index f666ed424..caf5fa5e3 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-4500k", "drop_old": true, "load": true, "search-serial": false, diff --git 
a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json index 0fe1c9a9d..f897ec422 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-5000k", "drop_old": true, "load": true, "search-serial": false, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json index f6b49680e..8d1bcf5c7 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-1000k", "drop_old": true, "load": true, "search-serial": false, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json index af3ea6175..b0edfe6f2 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-1500k", "drop_old": true, "load": true, "search-serial": false, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json index e80b1b20e..eba71b9f5 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json @@ -10,7 +10,7 @@ }, "cases": [ { - 
"db-label": "memory-comparison", + "db-label": "memory-comparison-2000k", "drop_old": true, "load": true, "search-serial": false, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json index 0a572765d..530e11bbc 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-2500k", "drop_old": true, "load": true, "search-serial": false, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json index e9a61787e..b49db3f1a 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-500k", "drop_old": true, "load": true, "search-serial": false, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json index 633aad799..f3aea5ec3 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-1000k", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json index d84947c21..98c5c71f7 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json @@ -10,7 +10,7 @@ }, 
"cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-1500k", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json index e04da17b4..2452055a8 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-2000k", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json index 752d71d60..100cc6cb1 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-2500k", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-3000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-3000k.json index 2c8f015e3..c1d3829b1 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-3000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-3000k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-3000k", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json index dfb850610..a69d3015a 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": 
"memory-comparison-3500k", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json index 68074b2f4..8096bc863 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-4000k", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-5000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-5000k.json index 63371ff68..a00471e09 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-5000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-5000k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-5000k", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-500k.json index 81d89826d..a151768b0 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-500k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-500k", "drop_old": false, "load": false, "search-serial": true, From 0ab30ce37893611a78a151c27689e6215be56331 Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Tue, 15 Oct 2024 20:09:05 +0500 Subject: [PATCH 11/20] Updated db-label in config. 
--- custom-run-configs/config-custom-dataset-small-hnsw-4500k.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-4500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-4500k.json index 24d830036..2f3b3ba0d 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-4500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-4500k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison", + "db-label": "memory-comparison-4500k", "drop_old": false, "load": false, "search-serial": true, From 2603723b980c46867ec856d0b3d0b37f172b57d8 Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Tue, 15 Oct 2024 20:52:10 +0500 Subject: [PATCH 12/20] Dividing configs based on dataset size for building indexes in parallel --- .../config-custom-dataset-small-hnsw-2000k.json | 0 .../config-custom-dataset-small-hnsw-500k.json | 0 .../config-custom-dataset-small-hnsw-3500k.json | 0 .../config-custom-dataset-small-hnsw-4000k.json | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename {custom-run-build-index-configs => custom-run-build-index-configs-large}/config-custom-dataset-small-hnsw-2000k.json (100%) rename {custom-run-build-index-configs => custom-run-build-index-configs-large}/config-custom-dataset-small-hnsw-500k.json (100%) rename {custom-run-build-index-configs-large => custom-run-build-index-configs}/config-custom-dataset-small-hnsw-3500k.json (100%) rename {custom-run-build-index-configs-large => custom-run-build-index-configs}/config-custom-dataset-small-hnsw-4000k.json (100%) diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-2000k.json similarity index 100% rename from custom-run-build-index-configs/config-custom-dataset-small-hnsw-2000k.json rename to custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-2000k.json diff --git 
a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-500k.json similarity index 100% rename from custom-run-build-index-configs/config-custom-dataset-small-hnsw-500k.json rename to custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-500k.json diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-3500k.json similarity index 100% rename from custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3500k.json rename to custom-run-build-index-configs/config-custom-dataset-small-hnsw-3500k.json diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-4000k.json similarity index 100% rename from custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4000k.json rename to custom-run-build-index-configs/config-custom-dataset-small-hnsw-4000k.json From 22e2fd9a9fbdc127eced22c4d41c6256d94ba048 Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Tue, 15 Oct 2024 21:14:25 +0500 Subject: [PATCH 13/20] Fixed create_dataset_args directory paths --- .../config-custom-dataset-small-hnsw-2000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-3000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-4500k.json | 4 ++-- .../config-custom-dataset-small-hnsw-5000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-500k.json | 4 ++-- .../config-custom-dataset-small-hnsw-1000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-1500k.json | 4 ++-- .../config-custom-dataset-small-hnsw-2500k.json | 4 ++-- .../config-custom-dataset-small-hnsw-3500k.json | 4 ++-- .../config-custom-dataset-small-hnsw-4000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-1000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-1500k.json | 4 ++-- 
.../config-custom-dataset-small-hnsw-2000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-2500k.json | 4 ++-- .../config-custom-dataset-small-hnsw-3000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-3500k.json | 4 ++-- .../config-custom-dataset-small-hnsw-4000k.json | 4 ++-- .../config-custom-dataset-small-hnsw-4500k.json | 4 ++-- .../config-custom-dataset-small-hnsw-5000k.json | 4 ++-- custom-run-configs/config-custom-dataset-small-hnsw-500k.json | 4 ++-- 20 files changed, 40 insertions(+), 40 deletions(-) diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-2000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-2000k.json index eba71b9f5..b17bca14b 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-2000k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-2000k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 4, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json index 477f395f4..eb77e65e1 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 6, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": 
"/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json index caf5fa5e3..3cd27b7a8 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 9, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json index f897ec422..b3292219f 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 10, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git 
a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-500k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-500k.json index b49db3f1a..92dd5af3c 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-500k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-500k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 1, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json index 8d1bcf5c7..4f70e0d08 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 2, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json index b0edfe6f2..c65b6f3ae 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json 
@@ -32,8 +32,8 @@ "custom-dataset-file-count": 3, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json index 530e11bbc..9eb156f0e 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 5, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-3500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-3500k.json index 2b3284e4c..9622276c3 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-3500k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-3500k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 7, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": 
"/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-4000k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-4000k.json index 25979e397..61f278149 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-4000k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-4000k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 8, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 1 diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json index f3aea5ec3..3f6a20112 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 2, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 3 diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json index 98c5c71f7..7e7f66318 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json @@ -32,8 
+32,8 @@ "custom-dataset-file-count": 3, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 3 diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json index 2452055a8..07ecf7844 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 4, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 3 diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json index 100cc6cb1..d58d863a5 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 5, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 3 diff --git 
a/custom-run-configs/config-custom-dataset-small-hnsw-3000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-3000k.json index c1d3829b1..f2b5975fa 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-3000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-3000k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 6, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 3 diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json index a69d3015a..29cf8bd89 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 7, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 3 diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json index 8096bc863..1e48bd316 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 8, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": 
"/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 3 diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-4500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-4500k.json index 2f3b3ba0d..e2bd637f9 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-4500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-4500k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 9, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 3 diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-5000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-5000k.json index a00471e09..a15a1ba0a 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-5000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-5000k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 10, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 3 diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-500k.json 
index a151768b0..99ec24468 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-500k.json @@ -32,8 +32,8 @@ "custom-dataset-file-count": 1, "custom-dataset-use-shuffled": false, "create-dataset-args": { - "directory": "/home/ubuntu/vectordb_bench/datasets/openai/openai_large_5m", - "save-dir-path": "/home/ubuntu/vectordb_bench/datasets/custom-openai/", + "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m", + "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/", "is-shuffled": false }, "run_count": 3 From 34e6b4514d47c10ece8996d9cd3e56e85e8fbe56 Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Tue, 15 Oct 2024 21:34:22 +0500 Subject: [PATCH 14/20] copy neighbors.parquet in created_dataset directory --- create_dataset_subsets.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/create_dataset_subsets.py b/create_dataset_subsets.py index 21b131a84..55c147374 100644 --- a/create_dataset_subsets.py +++ b/create_dataset_subsets.py @@ -44,6 +44,11 @@ def create_dataset(base_dir, save_dir_path, subset_prefix, file_prefix, file_cou shutil.copy(src_test_file, dst_test_file) logger.info(f"Copied test.parquet to {subset_dir}") + src_test_file = os.path.join(base_dir, "neighbors.parquet") + dst_test_file = os.path.join(subset_dir, "neighbors.parquet") + shutil.copy(src_test_file, dst_test_file) + logger.info(f"Copied neighbors.parquet to {subset_dir}") + logger.info(f"Dataset creation completed. {file_count} files have been copied to {subset_dir}.") if __name__ == "__main__": From 130425e01f6eeab9eac6409afecb0e3f51463371 Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Wed, 16 Oct 2024 11:26:03 +0500 Subject: [PATCH 15/20] Divided run config in 2 folders to run in parallel on 2 instances.
--- .../config-custom-dataset-small-hnsw-2000k.json | 0 .../config-custom-dataset-small-hnsw-3000k.json | 0 .../config-custom-dataset-small-hnsw-4500k.json | 0 .../config-custom-dataset-small-hnsw-5000k.json | 0 .../config-custom-dataset-small-hnsw-500k.json | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename {custom-run-configs => custom-run-configs-large}/config-custom-dataset-small-hnsw-2000k.json (100%) rename {custom-run-configs => custom-run-configs-large}/config-custom-dataset-small-hnsw-3000k.json (100%) rename {custom-run-configs => custom-run-configs-large}/config-custom-dataset-small-hnsw-4500k.json (100%) rename {custom-run-configs => custom-run-configs-large}/config-custom-dataset-small-hnsw-5000k.json (100%) rename {custom-run-configs => custom-run-configs-large}/config-custom-dataset-small-hnsw-500k.json (100%) diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-2000k.json b/custom-run-configs-large/config-custom-dataset-small-hnsw-2000k.json similarity index 100% rename from custom-run-configs/config-custom-dataset-small-hnsw-2000k.json rename to custom-run-configs-large/config-custom-dataset-small-hnsw-2000k.json diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-3000k.json b/custom-run-configs-large/config-custom-dataset-small-hnsw-3000k.json similarity index 100% rename from custom-run-configs/config-custom-dataset-small-hnsw-3000k.json rename to custom-run-configs-large/config-custom-dataset-small-hnsw-3000k.json diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-4500k.json b/custom-run-configs-large/config-custom-dataset-small-hnsw-4500k.json similarity index 100% rename from custom-run-configs/config-custom-dataset-small-hnsw-4500k.json rename to custom-run-configs-large/config-custom-dataset-small-hnsw-4500k.json diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-5000k.json b/custom-run-configs-large/config-custom-dataset-small-hnsw-5000k.json similarity index 100% rename from 
custom-run-configs/config-custom-dataset-small-hnsw-5000k.json rename to custom-run-configs-large/config-custom-dataset-small-hnsw-5000k.json diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-500k.json b/custom-run-configs-large/config-custom-dataset-small-hnsw-500k.json similarity index 100% rename from custom-run-configs/config-custom-dataset-small-hnsw-500k.json rename to custom-run-configs-large/config-custom-dataset-small-hnsw-500k.json From 43c18e651cf071d0a5b6d076074059d6c6023b51 Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Wed, 16 Oct 2024 11:43:17 +0500 Subject: [PATCH 16/20] Updated instance type in all configs. --- .../config-custom-dataset-small-hnsw-2000k.json | 2 +- .../config-custom-dataset-small-hnsw-3000k.json | 2 +- .../config-custom-dataset-small-hnsw-4500k.json | 2 +- .../config-custom-dataset-small-hnsw-5000k.json | 2 +- .../config-custom-dataset-small-hnsw-500k.json | 2 +- .../config-custom-dataset-small-hnsw-1000k.json | 2 +- .../config-custom-dataset-small-hnsw-1500k.json | 2 +- .../config-custom-dataset-small-hnsw-2500k.json | 2 +- .../config-custom-dataset-small-hnsw-3500k.json | 2 +- .../config-custom-dataset-small-hnsw-4000k.json | 2 +- .../config-custom-dataset-small-hnsw-2000k.json | 2 +- .../config-custom-dataset-small-hnsw-3000k.json | 2 +- .../config-custom-dataset-small-hnsw-4500k.json | 2 +- .../config-custom-dataset-small-hnsw-5000k.json | 2 +- .../config-custom-dataset-small-hnsw-500k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-1000k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-1500k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-2500k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-3500k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-4000k.json | 2 +- sample-configs/config-custom-dataset-small-hnsw.json | 2 +- 21 files changed, 21 insertions(+), 21 deletions(-) diff --git 
a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-2000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-2000k.json index b17bca14b..1a1649783 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-2000k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-2000k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-2000k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json index eb77e65e1..ebad41a1b 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-3000k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json index 3cd27b7a8..348ecc763 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-4500k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json index b3292219f..a9ce08aef 100644 --- 
a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-5000k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-500k.json b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-500k.json index 92dd5af3c..edcbd1fd9 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-500k.json +++ b/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-500k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-500k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json index 4f70e0d08..d0fe32403 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-1000k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json index c65b6f3ae..4b4cb164e 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-1500k", - "instance_type": 
"db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json index 9eb156f0e..89b84446c 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-2500k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-3500k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-3500k.json index 9622276c3..9550bbdf1 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-3500k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-3500k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-3500k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-4000k.json b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-4000k.json index 61f278149..0ef34ddfe 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-4000k.json +++ b/custom-run-build-index-configs/config-custom-dataset-small-hnsw-4000k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-4000k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-configs-large/config-custom-dataset-small-hnsw-2000k.json b/custom-run-configs-large/config-custom-dataset-small-hnsw-2000k.json index 07ecf7844..deddc393a 100644 --- 
a/custom-run-configs-large/config-custom-dataset-small-hnsw-2000k.json +++ b/custom-run-configs-large/config-custom-dataset-small-hnsw-2000k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-2000k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-configs-large/config-custom-dataset-small-hnsw-3000k.json b/custom-run-configs-large/config-custom-dataset-small-hnsw-3000k.json index f2b5975fa..6dc1c0cca 100644 --- a/custom-run-configs-large/config-custom-dataset-small-hnsw-3000k.json +++ b/custom-run-configs-large/config-custom-dataset-small-hnsw-3000k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-3000k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-configs-large/config-custom-dataset-small-hnsw-4500k.json b/custom-run-configs-large/config-custom-dataset-small-hnsw-4500k.json index e2bd637f9..f41ab4999 100644 --- a/custom-run-configs-large/config-custom-dataset-small-hnsw-4500k.json +++ b/custom-run-configs-large/config-custom-dataset-small-hnsw-4500k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-4500k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-configs-large/config-custom-dataset-small-hnsw-5000k.json b/custom-run-configs-large/config-custom-dataset-small-hnsw-5000k.json index a15a1ba0a..398891f9f 100644 --- a/custom-run-configs-large/config-custom-dataset-small-hnsw-5000k.json +++ b/custom-run-configs-large/config-custom-dataset-small-hnsw-5000k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-5000k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git 
a/custom-run-configs-large/config-custom-dataset-small-hnsw-500k.json b/custom-run-configs-large/config-custom-dataset-small-hnsw-500k.json index 99ec24468..4dba59c2e 100644 --- a/custom-run-configs-large/config-custom-dataset-small-hnsw-500k.json +++ b/custom-run-configs-large/config-custom-dataset-small-hnsw-500k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-500k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json index 3f6a20112..91bc15ec3 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-1000k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json index 7e7f66318..1cc8990d1 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-1500k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json index d58d863a5..0fde71065 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-2500k", - "instance_type": "db.m6i.large", + 
"instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json index 29cf8bd89..c9097398c 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-3500k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json index 1e48bd316..c52ac849a 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann-4000k", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, diff --git a/sample-configs/config-custom-dataset-small-hnsw.json b/sample-configs/config-custom-dataset-small-hnsw.json index 8eb2b865b..707ec41fa 100644 --- a/sample-configs/config-custom-dataset-small-hnsw.json +++ b/sample-configs/config-custom-dataset-small-hnsw.json @@ -4,7 +4,7 @@ "username": "postgres", "password": "postgres", "db_name": "ann", - "instance_type": "db.m6i.large", + "instance_type": "db.m6i.xlarge", "provider": "aws", "enable_seqscan": "on" }, From 2efaf440b50f40e0c68ffb82f89237a7c9ea4318 Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Thu, 17 Oct 2024 13:54:46 +0500 Subject: [PATCH 17/20] Added prewarm query result in logs. 
--- run-custom-dataset.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/run-custom-dataset.py b/run-custom-dataset.py index bde14a6be..01979ff46 100644 --- a/run-custom-dataset.py +++ b/run-custom-dataset.py @@ -186,7 +186,7 @@ def query_configurations(config): return {} def pre_warm(config): - print("Running pre warm") + print(f"Running pre warm for database:{config['db_name']}") try: conn = psycopg.connect( dbname=config['db_name'], @@ -195,11 +195,14 @@ def pre_warm(config): host=config['host'], ) cursor = conn.cursor() - cursor.execute("SELECT pg_prewarm('public.pgvector_index')"); + cursor.execute("SELECT pg_prewarm('public.pgvector_index') as block_loaded") + conn.commit() + + result = cursor.fetchone() + print(f"Pre-warm blocks loaded: {result[0]}") conn.close() - print("Pre-warm completed") - except Exception: - pass + except Exception as e: + print(f"Failed to pre-warm the database: {e}") def run_benchmark(case, db_config): base_command = [ @@ -294,7 +297,6 @@ def run_benchmark(case, db_config): get_stats(db_config) f.flush() pre_warm(db_config) - print(f"Running with prewarm") print(f"Running command: {' '.join(command)}") f.flush() From 7a695e5da3280eac47bceebb45ee2c952f22af30 Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Thu, 17 Oct 2024 14:13:07 +0500 Subject: [PATCH 18/20] Updated db label in all configs --- .../config-custom-dataset-small-hnsw-2000k.json | 2 +- .../config-custom-dataset-small-hnsw-3000k.json | 2 +- .../config-custom-dataset-small-hnsw-4500k.json | 2 +- .../config-custom-dataset-small-hnsw-5000k.json | 2 +- .../config-custom-dataset-small-hnsw-500k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-1000k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-1500k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-2500k.json | 2 +- custom-run-configs/config-custom-dataset-small-hnsw-3500k.json | 2 +- 
custom-run-configs/config-custom-dataset-small-hnsw-4000k.json | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/custom-run-configs-large/config-custom-dataset-small-hnsw-2000k.json b/custom-run-configs-large/config-custom-dataset-small-hnsw-2000k.json index deddc393a..5fb71007e 100644 --- a/custom-run-configs-large/config-custom-dataset-small-hnsw-2000k.json +++ b/custom-run-configs-large/config-custom-dataset-small-hnsw-2000k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison-2000k", + "db-label": "memory-comparison-2000k-20k-test-dataset", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs-large/config-custom-dataset-small-hnsw-3000k.json b/custom-run-configs-large/config-custom-dataset-small-hnsw-3000k.json index 6dc1c0cca..1ddec8fcf 100644 --- a/custom-run-configs-large/config-custom-dataset-small-hnsw-3000k.json +++ b/custom-run-configs-large/config-custom-dataset-small-hnsw-3000k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison-3000k", + "db-label": "memory-comparison-3000k-20k-test-dataset", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs-large/config-custom-dataset-small-hnsw-4500k.json b/custom-run-configs-large/config-custom-dataset-small-hnsw-4500k.json index f41ab4999..3405f3d4c 100644 --- a/custom-run-configs-large/config-custom-dataset-small-hnsw-4500k.json +++ b/custom-run-configs-large/config-custom-dataset-small-hnsw-4500k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison-4500k", + "db-label": "memory-comparison-4500k-20k-test-dataset", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs-large/config-custom-dataset-small-hnsw-5000k.json b/custom-run-configs-large/config-custom-dataset-small-hnsw-5000k.json index 398891f9f..101d94b4e 100644 --- a/custom-run-configs-large/config-custom-dataset-small-hnsw-5000k.json +++ 
b/custom-run-configs-large/config-custom-dataset-small-hnsw-5000k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison-5000k", + "db-label": "memory-comparison-5000k-20k-test-dataset", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs-large/config-custom-dataset-small-hnsw-500k.json b/custom-run-configs-large/config-custom-dataset-small-hnsw-500k.json index 4dba59c2e..8af70db82 100644 --- a/custom-run-configs-large/config-custom-dataset-small-hnsw-500k.json +++ b/custom-run-configs-large/config-custom-dataset-small-hnsw-500k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison-500k", + "db-label": "memory-comparison-500k-20k-test-dataset", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json index 91bc15ec3..eb2f6fd68 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison-1000k", + "db-label": "memory-comparison-1000k-20k-test-dataset", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json index 1cc8990d1..af7588328 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison-1500k", + "db-label": "memory-comparison-1500k-20k-test-dataset", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json index 0fde71065..692afe519 100644 --- 
a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison-2500k", + "db-label": "memory-comparison-2500k-20k-test-dataset", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json b/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json index c9097398c..8be316aa3 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison-3500k", + "db-label": "memory-comparison-3500k-20k-test-dataset", "drop_old": false, "load": false, "search-serial": true, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json b/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json index c52ac849a..4aa10662b 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json +++ b/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json @@ -10,7 +10,7 @@ }, "cases": [ { - "db-label": "memory-comparison-4000k", + "db-label": "memory-comparison-4000k-20k-test-dataset", "drop_old": false, "load": false, "search-serial": true, From aaff72d1a4559d318d6051cd56c1b19cb6cd9adc Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Mon, 4 Nov 2024 17:47:12 +0500 Subject: [PATCH 19/20] Updated custom dataset configs --- .../config-custom-dataset-small-hnsw-1000k.json | 8 ++++---- .../config-custom-dataset-small-hnsw-2000k.json | 8 ++++---- .../config-custom-dataset-small-hnsw-3500k.json | 8 ++++---- .../config-custom-dataset-small-hnsw-4000k.json | 8 ++++---- .../config-custom-dataset-small-hnsw-1500k.json | 8 ++++---- .../config-custom-dataset-small-hnsw-2500k.json | 8 ++++---- .../config-custom-dataset-small-hnsw-4500k.json | 8 ++++---- .../config-custom-dataset-small-hnsw-3000k.json | 8 
++++---- .../config-custom-dataset-small-hnsw-5000k.json | 8 ++++---- .../config-custom-dataset-small-hnsw-500k.json | 8 ++++---- .../config-custom-dataset-small-hnsw-1000k.json | 10 +++++----- .../config-custom-dataset-small-hnsw-2000k.json | 10 +++++----- .../config-custom-dataset-small-hnsw-3500k.json | 10 +++++----- .../config-custom-dataset-small-hnsw-4000k.json | 10 +++++----- .../config-custom-dataset-small-hnsw-1500k.json | 10 +++++----- .../config-custom-dataset-small-hnsw-2500k.json | 8 ++++---- .../config-custom-dataset-small-hnsw-4500k.json | 10 +++++----- .../config-custom-dataset-small-hnsw-3000k.json | 10 +++++----- .../config-custom-dataset-small-hnsw-5000k.json | 10 +++++----- .../config-custom-dataset-small-hnsw-500k.json | 10 +++++----- 20 files changed, 89 insertions(+), 89 deletions(-) rename {custom-run-build-index-configs => custom-run-build-index-configs-1}/config-custom-dataset-small-hnsw-1000k.json (89%) rename {custom-run-build-index-configs-large => custom-run-build-index-configs-1}/config-custom-dataset-small-hnsw-2000k.json (89%) rename {custom-run-build-index-configs => custom-run-build-index-configs-1}/config-custom-dataset-small-hnsw-3500k.json (89%) rename {custom-run-build-index-configs => custom-run-build-index-configs-1}/config-custom-dataset-small-hnsw-4000k.json (89%) rename {custom-run-build-index-configs => custom-run-build-index-configs-2}/config-custom-dataset-small-hnsw-1500k.json (89%) rename {custom-run-build-index-configs => custom-run-build-index-configs-2}/config-custom-dataset-small-hnsw-2500k.json (89%) rename {custom-run-build-index-configs-large => custom-run-build-index-configs-2}/config-custom-dataset-small-hnsw-4500k.json (89%) rename {custom-run-build-index-configs-large => custom-run-build-index-configs-3}/config-custom-dataset-small-hnsw-3000k.json (89%) rename {custom-run-build-index-configs-large => custom-run-build-index-configs-3}/config-custom-dataset-small-hnsw-5000k.json (89%) rename 
{custom-run-build-index-configs-large => custom-run-build-index-configs-3}/config-custom-dataset-small-hnsw-500k.json (89%) rename {custom-run-configs => custom-run-configs-1}/config-custom-dataset-small-hnsw-1000k.json (84%) rename {custom-run-configs-large => custom-run-configs-1}/config-custom-dataset-small-hnsw-2000k.json (84%) rename {custom-run-configs => custom-run-configs-1}/config-custom-dataset-small-hnsw-3500k.json (84%) rename {custom-run-configs => custom-run-configs-1}/config-custom-dataset-small-hnsw-4000k.json (84%) rename {custom-run-configs => custom-run-configs-2}/config-custom-dataset-small-hnsw-1500k.json (84%) rename {custom-run-configs => custom-run-configs-2}/config-custom-dataset-small-hnsw-2500k.json (86%) rename {custom-run-configs-large => custom-run-configs-2}/config-custom-dataset-small-hnsw-4500k.json (84%) rename {custom-run-configs-large => custom-run-configs-3}/config-custom-dataset-small-hnsw-3000k.json (84%) rename {custom-run-configs-large => custom-run-configs-3}/config-custom-dataset-small-hnsw-5000k.json (84%) rename {custom-run-configs-large => custom-run-configs-3}/config-custom-dataset-small-hnsw-500k.json (84%) diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json b/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-1000k.json similarity index 89% rename from custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json rename to custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-1000k.json index d0fe32403..c3674f572 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1000k.json +++ b/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-1000k.json @@ -4,8 +4,8 @@ "username": "postgres", "password": "postgres", "db_name": "ann-1000k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ @@ -16,8 +16,8 @@ 
"search-serial": false, "search-concurrent": false, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "42GB", - "max-parallel-workers": 15, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-2000k.json b/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-2000k.json similarity index 89% rename from custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-2000k.json rename to custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-2000k.json index 1a1649783..0d95f1197 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-2000k.json +++ b/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-2000k.json @@ -4,8 +4,8 @@ "username": "postgres", "password": "postgres", "db_name": "ann-2000k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ @@ -16,8 +16,8 @@ "search-serial": false, "search-concurrent": false, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "42GB", - "max-parallel-workers": 15, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-3500k.json b/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-3500k.json similarity index 89% rename from custom-run-build-index-configs/config-custom-dataset-small-hnsw-3500k.json rename to custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-3500k.json index 9550bbdf1..1cd60da9e 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-3500k.json +++ b/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-3500k.json @@ -4,8 +4,8 @@ "username": "postgres", "password": 
"postgres", "db_name": "ann-3500k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ @@ -16,8 +16,8 @@ "search-serial": false, "search-concurrent": false, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "42GB", - "max-parallel-workers": 15, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-4000k.json b/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-4000k.json similarity index 89% rename from custom-run-build-index-configs/config-custom-dataset-small-hnsw-4000k.json rename to custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-4000k.json index 0ef34ddfe..017c2bdcf 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-4000k.json +++ b/custom-run-build-index-configs-1/config-custom-dataset-small-hnsw-4000k.json @@ -4,8 +4,8 @@ "username": "postgres", "password": "postgres", "db_name": "ann-4000k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ @@ -16,8 +16,8 @@ "search-serial": false, "search-concurrent": false, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "42GB", - "max-parallel-workers": 15, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-1500k.json similarity index 89% rename from custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json rename to custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-1500k.json index 4b4cb164e..d726ea1fd 100644 --- 
a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-1500k.json +++ b/custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-1500k.json @@ -4,8 +4,8 @@ "username": "postgres", "password": "postgres", "db_name": "ann-1500k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ @@ -16,8 +16,8 @@ "search-serial": false, "search-concurrent": false, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "42GB", - "max-parallel-workers": 15, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json b/custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-2500k.json similarity index 89% rename from custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json rename to custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-2500k.json index 89b84446c..801005c0c 100644 --- a/custom-run-build-index-configs/config-custom-dataset-small-hnsw-2500k.json +++ b/custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-2500k.json @@ -4,8 +4,8 @@ "username": "postgres", "password": "postgres", "db_name": "ann-2500k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ @@ -16,8 +16,8 @@ "search-serial": false, "search-concurrent": false, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "42GB", - "max-parallel-workers": 15, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json b/custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-4500k.json similarity index 89% rename from 
custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json rename to custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-4500k.json index 348ecc763..e31cf3bfa 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-4500k.json +++ b/custom-run-build-index-configs-2/config-custom-dataset-small-hnsw-4500k.json @@ -4,8 +4,8 @@ "username": "postgres", "password": "postgres", "db_name": "ann-4500k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ @@ -16,8 +16,8 @@ "search-serial": false, "search-concurrent": false, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "42GB", - "max-parallel-workers": 15, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json b/custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-3000k.json similarity index 89% rename from custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json rename to custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-3000k.json index ebad41a1b..496d868db 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-3000k.json +++ b/custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-3000k.json @@ -4,8 +4,8 @@ "username": "postgres", "password": "postgres", "db_name": "ann-3000k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ @@ -16,8 +16,8 @@ "search-serial": false, "search-concurrent": false, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "42GB", - "max-parallel-workers": 15, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], 
"ef-construction": 128, "m": 32, diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json b/custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-5000k.json similarity index 89% rename from custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json rename to custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-5000k.json index a9ce08aef..0656dd083 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-5000k.json +++ b/custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-5000k.json @@ -4,8 +4,8 @@ "username": "postgres", "password": "postgres", "db_name": "ann-5000k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ @@ -16,8 +16,8 @@ "search-serial": false, "search-concurrent": false, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "42GB", - "max-parallel-workers": 15, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-500k.json b/custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-500k.json similarity index 89% rename from custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-500k.json rename to custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-500k.json index edcbd1fd9..61c3cd037 100644 --- a/custom-run-build-index-configs-large/config-custom-dataset-small-hnsw-500k.json +++ b/custom-run-build-index-configs-3/config-custom-dataset-small-hnsw-500k.json @@ -4,8 +4,8 @@ "username": "postgres", "password": "postgres", "db_name": "ann-500k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ @@ -16,8 +16,8 @@ "search-serial": 
false, "search-concurrent": false, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "42GB", - "max-parallel-workers": 15, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json b/custom-run-configs-1/config-custom-dataset-small-hnsw-1000k.json similarity index 84% rename from custom-run-configs/config-custom-dataset-small-hnsw-1000k.json rename to custom-run-configs-1/config-custom-dataset-small-hnsw-1000k.json index eb2f6fd68..cba9c28c2 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1000k.json +++ b/custom-run-configs-1/config-custom-dataset-small-hnsw-1000k.json @@ -4,20 +4,20 @@ "username": "postgres", "password": "postgres", "db_name": "ann-1000k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-1000k-20k-test-dataset", + "db-label": "memory-comparison-1000k", "drop_old": false, "load": false, "search-serial": true, "search-concurrent": true, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "8GB", - "max-parallel-workers": 3, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-configs-large/config-custom-dataset-small-hnsw-2000k.json b/custom-run-configs-1/config-custom-dataset-small-hnsw-2000k.json similarity index 84% rename from custom-run-configs-large/config-custom-dataset-small-hnsw-2000k.json rename to custom-run-configs-1/config-custom-dataset-small-hnsw-2000k.json index 5fb71007e..fc56c9280 100644 --- a/custom-run-configs-large/config-custom-dataset-small-hnsw-2000k.json +++ b/custom-run-configs-1/config-custom-dataset-small-hnsw-2000k.json @@ -4,20 +4,20 @@ "username": "postgres", "password": "postgres", "db_name": "ann-2000k", - 
"instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-2000k-20k-test-dataset", + "db-label": "memory-comparison-2000k", "drop_old": false, "load": false, "search-serial": true, "search-concurrent": true, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "8GB", - "max-parallel-workers": 3, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json b/custom-run-configs-1/config-custom-dataset-small-hnsw-3500k.json similarity index 84% rename from custom-run-configs/config-custom-dataset-small-hnsw-3500k.json rename to custom-run-configs-1/config-custom-dataset-small-hnsw-3500k.json index 8be316aa3..3f1145cf5 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-3500k.json +++ b/custom-run-configs-1/config-custom-dataset-small-hnsw-3500k.json @@ -4,20 +4,20 @@ "username": "postgres", "password": "postgres", "db_name": "ann-3500k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-3500k-20k-test-dataset", + "db-label": "memory-comparison-3500k", "drop_old": false, "load": false, "search-serial": true, "search-concurrent": true, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "8GB", - "max-parallel-workers": 3, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json b/custom-run-configs-1/config-custom-dataset-small-hnsw-4000k.json similarity index 84% rename from custom-run-configs/config-custom-dataset-small-hnsw-4000k.json rename to 
custom-run-configs-1/config-custom-dataset-small-hnsw-4000k.json index 4aa10662b..a74a21264 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-4000k.json +++ b/custom-run-configs-1/config-custom-dataset-small-hnsw-4000k.json @@ -4,20 +4,20 @@ "username": "postgres", "password": "postgres", "db_name": "ann-4000k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-4000k-20k-test-dataset", + "db-label": "memory-comparison-4000k", "drop_old": false, "load": false, "search-serial": true, "search-concurrent": true, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "8GB", - "max-parallel-workers": 3, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json b/custom-run-configs-2/config-custom-dataset-small-hnsw-1500k.json similarity index 84% rename from custom-run-configs/config-custom-dataset-small-hnsw-1500k.json rename to custom-run-configs-2/config-custom-dataset-small-hnsw-1500k.json index af7588328..7e3c29493 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-1500k.json +++ b/custom-run-configs-2/config-custom-dataset-small-hnsw-1500k.json @@ -4,20 +4,20 @@ "username": "postgres", "password": "postgres", "db_name": "ann-1500k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-1500k-20k-test-dataset", + "db-label": "memory-comparison-1500k", "drop_old": false, "load": false, "search-serial": true, "search-concurrent": true, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "8GB", - "max-parallel-workers": 3, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], 
"ef-construction": 128, "m": 32, diff --git a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json b/custom-run-configs-2/config-custom-dataset-small-hnsw-2500k.json similarity index 86% rename from custom-run-configs/config-custom-dataset-small-hnsw-2500k.json rename to custom-run-configs-2/config-custom-dataset-small-hnsw-2500k.json index 692afe519..4aed76165 100644 --- a/custom-run-configs/config-custom-dataset-small-hnsw-2500k.json +++ b/custom-run-configs-2/config-custom-dataset-small-hnsw-2500k.json @@ -4,20 +4,20 @@ "username": "postgres", "password": "postgres", "db_name": "ann-2500k", - "instance_type": "db.m6i.xlarge", + "instance_type": "db.m6i.large", "provider": "aws", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-2500k-20k-test-dataset", + "db-label": "memory-comparison-2500k", "drop_old": false, "load": false, "search-serial": true, "search-concurrent": true, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "8GB", - "max-parallel-workers": 3, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-configs-large/config-custom-dataset-small-hnsw-4500k.json b/custom-run-configs-2/config-custom-dataset-small-hnsw-4500k.json similarity index 84% rename from custom-run-configs-large/config-custom-dataset-small-hnsw-4500k.json rename to custom-run-configs-2/config-custom-dataset-small-hnsw-4500k.json index 3405f3d4c..be671b538 100644 --- a/custom-run-configs-large/config-custom-dataset-small-hnsw-4500k.json +++ b/custom-run-configs-2/config-custom-dataset-small-hnsw-4500k.json @@ -4,20 +4,20 @@ "username": "postgres", "password": "postgres", "db_name": "ann-4500k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-4500k-20k-test-dataset", + "db-label": "memory-comparison-4500k", 
"drop_old": false, "load": false, "search-serial": true, "search-concurrent": true, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "8GB", - "max-parallel-workers": 3, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-configs-large/config-custom-dataset-small-hnsw-3000k.json b/custom-run-configs-3/config-custom-dataset-small-hnsw-3000k.json similarity index 84% rename from custom-run-configs-large/config-custom-dataset-small-hnsw-3000k.json rename to custom-run-configs-3/config-custom-dataset-small-hnsw-3000k.json index 1ddec8fcf..1cdfb7beb 100644 --- a/custom-run-configs-large/config-custom-dataset-small-hnsw-3000k.json +++ b/custom-run-configs-3/config-custom-dataset-small-hnsw-3000k.json @@ -4,20 +4,20 @@ "username": "postgres", "password": "postgres", "db_name": "ann-3000k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-3000k-20k-test-dataset", + "db-label": "memory-comparison-3000k", "drop_old": false, "load": false, "search-serial": true, "search-concurrent": true, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "8GB", - "max-parallel-workers": 3, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-configs-large/config-custom-dataset-small-hnsw-5000k.json b/custom-run-configs-3/config-custom-dataset-small-hnsw-5000k.json similarity index 84% rename from custom-run-configs-large/config-custom-dataset-small-hnsw-5000k.json rename to custom-run-configs-3/config-custom-dataset-small-hnsw-5000k.json index 101d94b4e..b1244c0cb 100644 --- a/custom-run-configs-large/config-custom-dataset-small-hnsw-5000k.json +++ b/custom-run-configs-3/config-custom-dataset-small-hnsw-5000k.json @@ -4,20 +4,20 @@ "username": 
"postgres", "password": "postgres", "db_name": "ann-5000k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-5000k-20k-test-dataset", + "db-label": "memory-comparison-5000k", "drop_old": false, "load": false, "search-serial": true, "search-concurrent": true, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "8GB", - "max-parallel-workers": 3, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, diff --git a/custom-run-configs-large/config-custom-dataset-small-hnsw-500k.json b/custom-run-configs-3/config-custom-dataset-small-hnsw-500k.json similarity index 84% rename from custom-run-configs-large/config-custom-dataset-small-hnsw-500k.json rename to custom-run-configs-3/config-custom-dataset-small-hnsw-500k.json index 8af70db82..f3ef4ba40 100644 --- a/custom-run-configs-large/config-custom-dataset-small-hnsw-500k.json +++ b/custom-run-configs-3/config-custom-dataset-small-hnsw-500k.json @@ -4,20 +4,20 @@ "username": "postgres", "password": "postgres", "db_name": "ann-500k", - "instance_type": "db.m6i.xlarge", - "provider": "aws", + "instance_type": "Standard_D8ds_v5", + "provider": "azure", "enable_seqscan": "on" }, "cases": [ { - "db-label": "memory-comparison-500k-20k-test-dataset", + "db-label": "memory-comparison-500k", "drop_old": false, "load": false, "search-serial": true, "search-concurrent": true, "case-type": "PerformanceCustomDataset", - "maintenance-work-mem": "8GB", - "max-parallel-workers": 3, + "maintenance-work-mem": "16GB", + "max-parallel-workers": 7, "ef-search": [40], "ef-construction": 128, "m": 32, From c675feb86014c8fd0764ead4775bd14a6ebe8548 Mon Sep 17 00:00:00 2001 From: Sheharyar Ahmad Date: Mon, 4 Nov 2024 18:03:55 +0500 Subject: [PATCH 20/20] set shuffled_data to false --- vectordb_bench/__init__.py | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/vectordb_bench/__init__.py b/vectordb_bench/__init__.py index 3d8419a4f..3795535ed 100644 --- a/vectordb_bench/__init__.py +++ b/vectordb_bench/__init__.py @@ -21,7 +21,7 @@ class config: NUM_PER_BATCH = env.int("NUM_PER_BATCH", 5000) DROP_OLD = env.bool("DROP_OLD", True) - USE_SHUFFLED_DATA = env.bool("USE_SHUFFLED_DATA", True) + USE_SHUFFLED_DATA = env.bool("USE_SHUFFLED_DATA", False) NUM_CONCURRENCY = env.list("NUM_CONCURRENCY", [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100], subcast=int )