Custom dataset testing #9

Open. Wants to merge 21 commits into base: diskann-testing.

Commits (21)
3905f46
Add separate configs for index and search for multiple datasets.
Sheharyar570 Oct 14, 2024
4eb23bb
Update run script to handle multiple configs in a directory.
Sheharyar570 Oct 14, 2024
f662dd1
Resolved comments
Sheharyar570 Oct 15, 2024
10e1d74
Updated script to generate only the dataset with the specified file_count.
Sheharyar570 Oct 15, 2024
a5cb055
Added config files for large datasets
Sheharyar570 Oct 15, 2024
b605c83
Updated scripts and configs to create dataset at runtime.
Sheharyar570 Oct 15, 2024
12a439a
Updated dataset directory paths
Sheharyar570 Oct 15, 2024
d990ac5
Added run configs for custom datasets of up to 5 million rows
Sheharyar570 Oct 15, 2024
b6dd4d5
Increased run count to 3 for search configs
Sheharyar570 Oct 15, 2024
17b34e0
Updated db labels in configs.
Sheharyar570 Oct 15, 2024
0ab30ce
Updated db-label in config.
Sheharyar570 Oct 15, 2024
2603723
Dividing configs based on dataset size for building indexes in parallel
Sheharyar570 Oct 15, 2024
22e2fd9
Fixed create_dataset_args directory paths
Sheharyar570 Oct 15, 2024
34e6b45
Copy neighbors.parquet into created_dataset directory
Sheharyar570 Oct 15, 2024
130425e
Divided run configs into 2 folders to run in parallel on 2 instances.
Sheharyar570 Oct 16, 2024
43c18e6
Updated instance type in all configs.
Sheharyar570 Oct 16, 2024
2efaf44
Added prewarm query result to logs.
Sheharyar570 Oct 17, 2024
2fda975
Merge branch 'diskann-testing' of https://github.com/EmumbaOrg/Vector…
Sheharyar570 Oct 17, 2024
7a695e5
Updated db label in all configs
Sheharyar570 Oct 17, 2024
aaff72d
Updated custom dataset configs
Sheharyar570 Nov 4, 2024
c675feb
set shuffled_data to false
Sheharyar570 Nov 4, 2024
74 changes: 54 additions & 20 deletions create_dataset_subsets.py
@@ -1,36 +1,63 @@
 import os
 import shutil
 import argparse
+import logging
 
+# Set up logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
 def get_file_name(file_name: str, file_prefix: str, files_count: int) -> str:
     if files_count == 1:
         return file_prefix + ".parquet"
     file_name = file_name.split("of-")[0]
     return file_name + "of-" + str(files_count).zfill(2) + ".parquet"
 
-def create_subsets(base_dir, save_dir_path, subset_prefix, file_prefix, step_size):
-    files = sorted([f for f in os.listdir(base_dir) if f.startswith(file_prefix)])
+def create_dataset(base_dir, save_dir_path, subset_prefix, file_prefix, file_count, row_count=500_000):
+    logger.info(f"Starting dataset creation with {file_count} files.")
+
+    # Sort the files and pick only the first 'file_count' files
+    files = sorted([f for f in os.listdir(base_dir) if f.startswith(file_prefix)])[:file_count]
     num_files = len(files)
 
+    if num_files == 0:
+        logger.warning("No files found with the specified prefix.")
+        return
+
+    logger.info(f"Found {num_files} files. Creating dataset...")
+
+    # Create the directory for the dataset
+    subset_dir = os.path.join(save_dir_path, f"{subset_prefix}_{file_count * row_count // 1000}k")
+    os.makedirs(subset_dir, exist_ok=True)
+    logger.info(f"Created directory for the dataset: {subset_dir}")
+
+    # Copy the first 'file_count' files into the subset directory
+    for file in files:
+        src_file = os.path.join(base_dir, file)
+        dst_file = os.path.join(subset_dir, get_file_name(file, file_prefix, file_count))
+        shutil.copy(src_file, dst_file)
+        logger.info(f"Copied {file} to {dst_file}")
 
-    for i in range(1, num_files + 1):
-        subset_dir = os.path.join(save_dir_path, f"{subset_prefix}_{i * step_size // 1000}k")
-        os.makedirs(subset_dir, exist_ok=True)
-
-        for j in range(i):
-            src_file = os.path.join(base_dir, files[j])
-            dst_file = os.path.join(subset_dir, get_file_name(files[j], file_prefix, i))
-            shutil.copy(src_file, dst_file)
-        src_test_file = os.path.join(base_dir, "test.parquet")
-        dst_test_file = os.path.join(subset_dir, "test.parquet")
-        shutil.copy(src_test_file, dst_test_file)
+    # Also copy the test.parquet file
+    src_test_file = os.path.join(base_dir, "test.parquet")
+    dst_test_file = os.path.join(subset_dir, "test.parquet")
+    shutil.copy(src_test_file, dst_test_file)
+    logger.info(f"Copied test.parquet to {subset_dir}")
+
+    src_test_file = os.path.join(base_dir, "neighbors.parquet")
+    dst_test_file = os.path.join(subset_dir, "neighbors.parquet")
+    shutil.copy(src_test_file, dst_test_file)
+    logger.info(f"Copied neighbors.parquet to {subset_dir}")
+
+    logger.info(f"Dataset creation completed. {file_count} files have been copied to {subset_dir}.")
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Create subsets of Parquet files using Dask.")
-    parser.add_argument("--directory", type=str, help="Path to the directory containing Parquet files.")
-    parser.add_argument("--save-dir-path", type=str, help="Directory path where data will be saved")
-    parser.add_argument("--dataset-name-prefix", type=str, help="Name prefix of the folder where each subset will be saved.")
+    parser = argparse.ArgumentParser(description="Create a dataset with a specified number of Parquet files.")
+    parser.add_argument("--directory", type=str, required=True, help="Path to the directory containing Parquet files.")
+    parser.add_argument("--save-dir-path", type=str, required=True, help="Directory path where the dataset will be saved.")
+    parser.add_argument("--dataset-name-prefix", type=str, help="Name prefix for the dataset folder.")
+    parser.add_argument("--is-shuffled", type=bool, help="Whether the files are shuffled or not.")
+    parser.add_argument("--file-count", type=int, required=True, help="Number of Parquet files to include in the dataset.")
     args = parser.parse_args()
 
     file_prefix = (
@@ -48,7 +75,14 @@ def create_subsets(base_dir, save_dir_path, subset_prefix, file_prefix, step_size):
         if args.save_dir_path
         else args.directory
     )
-    step_size = 500_000  # 500k
-
-    create_subsets(args.directory, save_dir_path, subset_prefix, file_prefix, step_size)
-    print(f'Finished creating subsets of Parquet files in {args.directory}.')
+
+    if os.path.exists(save_dir_path) and os.listdir(save_dir_path):
+        shutil.rmtree(save_dir_path)
+        logger.info(f"Deleted existing directory: {save_dir_path}")
+
+    # Log the input parameters
+    logger.info(f"Parameters received: directory={args.directory}, save_dir_path={args.save_dir_path}, file_count={args.file_count}, dataset_name_prefix={subset_prefix}, is_shuffled={args.is_shuffled}")
+
+    # Create the dataset with the specified file_count
+    create_dataset(args.directory, save_dir_path, subset_prefix, file_prefix, args.file_count)
+    logger.info(f'Finished creating a dataset with {args.file_count} Parquet files.')
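
For reference, a quick trace of the renaming rule that get_file_name implements, assuming the source shards follow a train-XX-of-YY.parquet naming pattern (the shard names below are hypothetical; the function itself depends only on the "of-" separator): each copied shard's "of-NN" suffix is rewritten to describe the new dataset's file count rather than the source corpus.

# Hypothetical shard names, shown only to illustrate the renaming rule.
get_file_name("train-00-of-10.parquet", "train", 1)  # -> "train.parquet"
get_file_name("train-00-of-10.parquet", "train", 4)  # -> "train-00-of-04.parquet"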
@@ -0,0 +1,43 @@
{
  "database": {
    "host": "localhost1",
    "username": "postgres",
    "password": "postgres",
    "db_name": "ann-1000k",
    "instance_type": "Standard_D8ds_v5",
    "provider": "azure",
    "enable_seqscan": "on"
  },
  "cases": [
    {
      "db-label": "memory-comparison-1000k",
      "drop_old": true,
      "load": true,
      "search-serial": false,
      "search-concurrent": false,
      "case-type": "PerformanceCustomDataset",
      "maintenance-work-mem": "16GB",
      "max-parallel-workers": 7,
      "ef-search": [40],
      "ef-construction": 128,
      "m": 32,
      "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
      "concurrency-duration": 30,
      "k": 10,
      "custom-case-name": "hnsw-1536D-1m",
      "custom-dataset-name": "custom-openai",
      "custom-dataset-dir": "openai_1000k",
      "custom-dataset-size": 1000000,
      "custom-dataset-dim": 1536,
      "custom-dataset-file-count": 2,
      "custom-dataset-use-shuffled": false,
      "create-dataset-args": {
        "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
        "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
        "is-shuffled": false
      },
      "run_count": 1
    }
  ]
}
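
This PR does not show how the benchmark runner consumes create-dataset-args, but the arithmetic is visible in the script above: custom-dataset-file-count selects the number of 500,000-row shards, so the 2 files here produce the 1,000,000-row openai_1000k directory named by custom-dataset-dir (likewise 4 files for the 2000k case, 7 for 3500k, and so on). A minimal sketch of that wiring, assuming the runner simply forwards these fields to create_dataset_subsets.py; the config file name and the "openai" prefix are hypothetical:

import json
import subprocess

# Hypothetical config file name; the actual file names are not shown in this PR.
with open("run-config-1000k.json") as f:
    case = json.load(f)["cases"][0]

dataset_args = case["create-dataset-args"]
cmd = [
    "python", "create_dataset_subsets.py",
    "--directory", dataset_args["directory"],
    "--save-dir-path", dataset_args["save-dir-path"],
    # Hypothetical prefix: "openai" would yield the openai_1000k directory
    # name that custom-dataset-dir expects.
    "--dataset-name-prefix", "openai",
    "--file-count", str(case["custom-dataset-file-count"]),
]
# --is-shuffled is omitted on purpose: the script declares it with type=bool,
# and argparse turns any non-empty string (even "False") into True.
subprocess.run(cmd, check=True)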

@@ -0,0 +1,43 @@
{
  "database": {
    "host": "localhost1",
    "username": "postgres",
    "password": "postgres",
    "db_name": "ann-2000k",
    "instance_type": "Standard_D8ds_v5",
    "provider": "azure",
    "enable_seqscan": "on"
  },
  "cases": [
    {
      "db-label": "memory-comparison-2000k",
      "drop_old": true,
      "load": true,
      "search-serial": false,
      "search-concurrent": false,
      "case-type": "PerformanceCustomDataset",
      "maintenance-work-mem": "16GB",
      "max-parallel-workers": 7,
      "ef-search": [40],
      "ef-construction": 128,
      "m": 32,
      "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
      "concurrency-duration": 30,
      "k": 10,
      "custom-case-name": "hnsw-1536D-2m",
      "custom-dataset-name": "custom-openai",
      "custom-dataset-dir": "openai_2000k",
      "custom-dataset-size": 2000000,
      "custom-dataset-dim": 1536,
      "custom-dataset-file-count": 4,
      "custom-dataset-use-shuffled": false,
      "create-dataset-args": {
        "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
        "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
        "is-shuffled": false
      },
      "run_count": 1
    }
  ]
}

@@ -0,0 +1,43 @@
{
  "database": {
    "host": "localhost2",
    "username": "postgres",
    "password": "postgres",
    "db_name": "ann-3500k",
    "instance_type": "Standard_D8ds_v5",
    "provider": "azure",
    "enable_seqscan": "on"
  },
  "cases": [
    {
      "db-label": "memory-comparison-3500k",
      "drop_old": true,
      "load": true,
      "search-serial": false,
      "search-concurrent": false,
      "case-type": "PerformanceCustomDataset",
      "maintenance-work-mem": "16GB",
      "max-parallel-workers": 7,
      "ef-search": [40],
      "ef-construction": 128,
      "m": 32,
      "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
      "concurrency-duration": 30,
      "k": 10,
      "custom-case-name": "hnsw-1536D-3_5m",
      "custom-dataset-name": "custom-openai",
      "custom-dataset-dir": "openai_3500k",
      "custom-dataset-size": 3500000,
      "custom-dataset-dim": 1536,
      "custom-dataset-file-count": 7,
      "custom-dataset-use-shuffled": false,
      "create-dataset-args": {
        "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
        "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
        "is-shuffled": false
      },
      "run_count": 1
    }
  ]
}

@@ -0,0 +1,43 @@
{
  "database": {
    "host": "localhost1",
    "username": "postgres",
    "password": "postgres",
    "db_name": "ann-4000k",
    "instance_type": "Standard_D8ds_v5",
    "provider": "azure",
    "enable_seqscan": "on"
  },
  "cases": [
    {
      "db-label": "memory-comparison-4000k",
      "drop_old": true,
      "load": true,
      "search-serial": false,
      "search-concurrent": false,
      "case-type": "PerformanceCustomDataset",
      "maintenance-work-mem": "16GB",
      "max-parallel-workers": 7,
      "ef-search": [40],
      "ef-construction": 128,
      "m": 32,
      "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
      "concurrency-duration": 30,
      "k": 10,
      "custom-case-name": "hnsw-1536D-4m",
      "custom-dataset-name": "custom-openai",
      "custom-dataset-dir": "openai_4000k",
      "custom-dataset-size": 4000000,
      "custom-dataset-dim": 1536,
      "custom-dataset-file-count": 8,
      "custom-dataset-use-shuffled": false,
      "create-dataset-args": {
        "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
        "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
        "is-shuffled": false
      },
      "run_count": 1
    }
  ]
}

@@ -0,0 +1,43 @@
{
  "database": {
    "host": "localhost2",
    "username": "postgres",
    "password": "postgres",
    "db_name": "ann-1500k",
    "instance_type": "Standard_D8ds_v5",
    "provider": "azure",
    "enable_seqscan": "on"
  },
  "cases": [
    {
      "db-label": "memory-comparison-1500k",
      "drop_old": true,
      "load": true,
      "search-serial": false,
      "search-concurrent": false,
      "case-type": "PerformanceCustomDataset",
      "maintenance-work-mem": "16GB",
      "max-parallel-workers": 7,
      "ef-search": [40],
      "ef-construction": 128,
      "m": 32,
      "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
      "concurrency-duration": 30,
      "k": 10,
      "custom-case-name": "hnsw-1536D-1_5m",
      "custom-dataset-name": "custom-openai",
      "custom-dataset-dir": "openai_1500k",
      "custom-dataset-size": 1500000,
      "custom-dataset-dim": 1536,
      "custom-dataset-file-count": 3,
      "custom-dataset-use-shuffled": false,
      "create-dataset-args": {
        "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
        "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
        "is-shuffled": false
      },
      "run_count": 1
    }
  ]
}

@@ -0,0 +1,43 @@
{
  "database": {
    "host": "localhost2",
    "username": "postgres",
    "password": "postgres",
    "db_name": "ann-2500k",
    "instance_type": "Standard_D8ds_v5",
    "provider": "azure",
    "enable_seqscan": "on"
  },
  "cases": [
    {
      "db-label": "memory-comparison-2500k",
      "drop_old": true,
      "load": true,
      "search-serial": false,
      "search-concurrent": false,
      "case-type": "PerformanceCustomDataset",
      "maintenance-work-mem": "16GB",
      "max-parallel-workers": 7,
      "ef-search": [40],
      "ef-construction": 128,
      "m": 32,
      "num-concurrency": "1,10,20,30,40,50,60,70,80,90,100",
      "concurrency-duration": 30,
      "k": 10,
      "custom-case-name": "hnsw-1536D-2_5m",
      "custom-dataset-name": "custom-openai",
      "custom-dataset-dir": "openai_2500k",
      "custom-dataset-size": 2500000,
      "custom-dataset-dim": 1536,
      "custom-dataset-file-count": 5,
      "custom-dataset-use-shuffled": false,
      "create-dataset-args": {
        "directory": "/home/ubuntu/vectordb_bench/dataset/openai/openai_large_5m",
        "save-dir-path": "/home/ubuntu/vectordb_bench/dataset/custom-openai/",
        "is-shuffled": false
      },
      "run_count": 1
    }
  ]
}
