Add mixed workload #1

Open · wants to merge 32 commits into master
Commits (32)
3b4ae36
POC done, running vector and OLTP queries in parallel.
Sheharyar570 Aug 28, 2024
e7f8869
DB functions, prepared statement, and stored procedures added for vecto…
Sheharyar570 Aug 28, 2024
e515598
Assign vector and OLTP workload to different set of virtual users to …
Sheharyar570 Aug 28, 2024
2cb8bd0
Bug fixed: vu 1 exits with finished failed status, because not all vi…
Sheharyar570 Aug 28, 2024
9561caf
Added QPS metric for vector queries.
Sheharyar570 Aug 28, 2024
7329a02
Added Recall metric calculation for vector queries
Sheharyar570 Aug 28, 2024
93f9c61
Add QPS calculation in xtprof
wahajali Aug 29, 2024
f2718d4
Vector QPS calculation - ignore rampup data
wahajali Aug 29, 2024
8a01ff2
Merge pull request #2 from EmumbaOrg/add-qps-calculation
Sheharyar570 Aug 29, 2024
1be4d57
Dataset load in memory and read optimization and some refactoring
Sheharyar570 Aug 30, 2024
e6a4a06
Updated OLTP workload to start profiling after rampup time.
Sheharyar570 Aug 30, 2024
44cf3bf
Recall calculated after test run using custom script.
Sheharyar570 Aug 30, 2024
9ad3d01
Vector workload related params configuration enabled from config file.
Sheharyar570 Sep 5, 2024
cded18c
Update vector related params from HammerDB CLI.
Sheharyar570 Sep 5, 2024
5f8667e
Fixed issue with setting session params.
Sheharyar570 Sep 11, 2024
6ab1e21
Added automation script to run mixed workload with different configu…
Sheharyar570 Sep 11, 2024
7ff3f30
Updated xtprof module to store results in path defined in a separate …
Sheharyar570 Sep 11, 2024
9dfcea1
Add distance operator dynamically to prepared statement. Updated vect…
Sheharyar570 Sep 12, 2024
485d84e
Use relative path to test and ground_truth data files.
Sheharyar570 Sep 12, 2024
79dbd3e
Configure vectordb dict in hammerdb before running each benchmark case.
Sheharyar570 Sep 12, 2024
e9ddb0c
Update config.json to support optional build schema, warehouse count,…
Sheharyar570 Sep 17, 2024
ab0a4ad
Add NOPM and TPM metric in xtprof module
Sheharyar570 Sep 17, 2024
27a7d8a
Added pgdiskann support
Sheharyar570 Sep 19, 2024
8e2fc30
Updated config file name
Sheharyar570 Sep 19, 2024
ccde878
Fixed diskann automation script
Sheharyar570 Sep 19, 2024
735a8df
Fixed bug
Sheharyar570 Sep 24, 2024
1d44c44
Add mixed workload distribution between vu ratio configuration param
Sheharyar570 Sep 24, 2024
95a0177
Add vector table name configuration parameter.
Sheharyar570 Sep 24, 2024
898f83a
Removed stored procedures option for vector search.
Sheharyar570 Oct 2, 2024
4a26393
Script to combine benchmark results in multiple files into single jso…
Sheharyar570 Oct 2, 2024
09d1614
Dump HammerDB config in logs
Sheharyar570 Oct 4, 2024
d15d1ed
Updated script to handle results in multiple directories
Sheharyar570 Oct 4, 2024
74 changes: 74 additions & 0 deletions combine_bm_results.py
@@ -0,0 +1,74 @@
import os
import re
import json
import argparse

parser = argparse.ArgumentParser(description="Combine benchmark results")
parser.add_argument("--benchmark", type=str, choices=["HammerDB", "VectorDBBench"], help="Path to the directory where results are stored")
parser.add_argument("--results-dir-path", type=str, help="Path to the directory where results are stored")
args = parser.parse_args()


def extract_from_hdbxtprofile(file_path):
    """Parse the per-VU summary blocks from the xtprof profiler log."""
    with open(file_path, "r") as file:
        content = file.read()
    summaries = re.findall(
        r'>>>>> SUMMARY OF ([\d.]+) ACTIVE VIRTUAL USERS.*?AVG:\s*([\d.]+)ms.*?P99:\s*([\d.]+).*?TOTAL VECTOR QPS:\s*([\d.]+).*?NOPM:\s*(\d+).*?TPM:\s*(\d+)',
        content, re.DOTALL | re.IGNORECASE
    )
    results = {}

    for summary in summaries:
        user_count = int(summary[0])
        avg_latency = float(summary[1])
        p99_latency = float(summary[2])
        total_vector_qps = float(summary[3])
        # Capture groups 4 and 5 are NOPM and TPM, in that order (see the
        # regex above), so assign them accordingly.
        nopm = int(summary[4])
        tpm = int(summary[5])

        results[user_count] = {
            "Total Vector QPS": total_vector_qps,
            "TPM": tpm,
            "NOPM": nopm,
            "avg_latency": avg_latency,
            "P99 Latency": p99_latency
        }

    return results


def extract_from_log_txt(file_path):
    """Collect the 'key: value' header lines written before the command dump."""
    result = {}
    with open(file_path, "r") as file:
        for line in file:
            if line.startswith("Running command:"):
                break
            if ":" in line:
                key, value = line.split(":", 1)
                result[key.strip()] = value.strip()
    return result


def extract_from_json(file_path):
    with open(file_path, "r") as file:
        return json.load(file)


def main():
    for root, _, file_names in os.walk(args.results_dir_path):
        combined_data = {}
        for file_name in file_names:
            if file_name == "log.txt":
                combined_data["config"] = extract_from_log_txt(os.path.join(root, file_name))
            elif args.benchmark == "HammerDB" and file_name == "hdbxtprofile.log":
                combined_data["HammerDB"] = extract_from_hdbxtprofile(os.path.join(root, file_name))
            elif file_name.startswith("result") and file_name.endswith(".json"):
                combined_data["vectordb"] = extract_from_json(os.path.join(root, file_name))['results']

        if len(file_names) > 0:
            result_file_name = os.path.basename(root) + "-results.json"
            with open(os.path.join(root, result_file_name), "w") as result_file:
                json.dump(combined_data, result_file, indent=4)
            print(f"File saved in {root}")

    print("Results combined successfully")


if __name__ == "__main__":
    main()
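For reference, the script is invoked per the argparse definitions above, e.g. python combine_bm_results.py --benchmark HammerDB --results-dir-path ./results (the path being wherever the automation scripts wrote results). Every directory under that path containing a log.txt, an hdbxtprofile.log, or a result*.json then gets a companion <dirname>-results.json with the merged config, HammerDB, and vectordb sections.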
48 changes: 48 additions & 0 deletions config.json
@@ -0,0 +1,48 @@
{
    "database": {
        "host": "172.17.0.2",
        "username": "postgres",
        "password": "admin123",
        "db_name": "ann",
        "instance_type": "db.m6i.large",
        "provider": "aws",
        "enable_seqscan": "on"
    },
    "hammerdb": {
        "db": "pg",
        "bm": "TPC-C",
        "vindex": "hnsw",
        "vector_table_name": "public.pg_vector_collection",
        "build_schema": true,
        "pg_driver": "timed",
        "pg_total_iterations": "10000000",
        "pg_count_ware": "400",
        "pg_num_vu": "10",
        "pg_rampup": "0",
        "pg_duration": "1",
        "pg_allwarehouse": "false",
        "pg_timeprofile": "true",
        "pg_vacuum": "false",
        "keepalive_margin": "90"
    },
    "cases": [
        {
            "db-label": "run1-seqon",
            "drop_old": true,
            "load": true,
            "search-serial": false,
            "search-concurrent": false,
            "case-type": "Performance1536D50K",
            "maintenance-work-mem": "4GB",
            "max-parallel-workers": 2,
            "ef-search": [10, 20, 40, 80, 120, 200, 400],
            "ef-construction": 32,
            "m": 8,
            "num-concurrency": ["1", "10", "20", "30", "40", "50", "60", "70", "80", "90", "100"],
            "concurrency-duration": 30,
            "k": 10,
            "mw_oltp_vector_vu_ratio": "0.8",
            "run_count": 1
        }
    ]
}
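One note on mw_oltp_vector_vu_ratio: it sets what fraction of the virtual users run one side of the mixed workload, with the remainder running the other. A minimal sketch of the split, assuming the ratio is the OLTP fraction and ordinary rounding (the exact rounding the Tcl scripts apply is not shown in this diff):

def split_virtual_users(total_vu: int, oltp_vector_vu_ratio: float):
    # A ratio of 0.8 with 10 VUs gives 8 OLTP VUs and 2 vector VUs.
    # The rounding behaviour here is an assumption, not HammerDB's code.
    oltp_vu = round(total_vu * oltp_vector_vu_ratio)
    vector_vu = total_vu - oltp_vu
    return oltp_vu, vector_vu

print(split_virtual_users(10, 0.8))  # (8, 2)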
5 changes: 4 additions & 1 deletion config/generic.xml
@@ -75,6 +75,9 @@
   <profiler>xtprof</profiler>
   <xt_unique_log_name>0</xt_unique_log_name>
   <xt_gather_timeout>60</xt_gather_timeout>
-  <xt_job_storage>1</xt_job_storage>
+  <xt_job_storage>0</xt_job_storage>
  </timeprofile>
+ <vectordb>
+   <vindex>hnsw</vindex>
+ </vectordb>
 </hammerdb>
63 changes: 63 additions & 0 deletions config/vectordb.xml
@@ -0,0 +1,63 @@
<?xml version="1.0" encoding="utf-8"?>
<vectordb>
<ivfflat>
<name>IVF_FLAT</name>
<desc>Inverted File Flat Index</desc>
<session_params>
<ss_probes>10</ss_probes>
</session_params>
<search_params>
<se_k>10</se_k>
<se_distance>cosine</se_distance>
</search_params>
<index_params>
<in_max_parallel_workers>4</in_max_parallel_workers>
<in_maintenance_work_mem>8GB</in_maintenance_work_mem>
</index_params>
<index_creation_with_options>
<ino_lists>100</ino_lists>
</index_creation_with_options>
</ivfflat>
<hnsw>
<name>HNSW</name>
<desc>Hierarchical Navigable Small World</desc>
<session_params>
<ss_hnsw.ef_search>100</ss_hnsw.ef_search>
</session_params>
<search_params>
<se_k>10</se_k>
<se_distance>cosine</se_distance>
</search_params>
<index_params>
<in_max_parallel_workers>4</in_max_parallel_workers>
<in_maintenance_work_mem>8GB</in_maintenance_work_mem>
</index_params>
<index_creation_with_options>
<ino_ef_construction>200</ino_ef_construction>
<ino_m>16</ino_m>
</index_creation_with_options>
</hnsw>
<pgdiskann>
<name>PGDiskAnn</name>
<desc>DiskAnn</desc>
<session_params>
<ss_diskann.l_value_is>100</ss_diskann.l_value_is>
</session_params>
<search_params>
<se_k>10</se_k>
<se_distance>cosine</se_distance>
</search_params>
<index_params>
<in_max_parallel_workers>4</in_max_parallel_workers>
<in_maintenance_work_mem>8GB</in_maintenance_work_mem>
</index_params>
<index_creation_with_options>
<ino_max_neighbors>128</ino_max_neighbors>
<ino_l_value_ib>16</ino_l_value_ib>
</index_creation_with_options>
</pgdiskann>
<mixed_workload>
<mw_oltp_vector_vu_ratio>0.8</mw_oltp_vector_vu_ratio>
<vector_table_name>public.pg_vector_collection</vector_table_name>
</mixed_workload>
</vectordb>
1,000 changes: 1,000 additions & 0 deletions dataset/vector/ground_truth/output_gt.csv


1,000 changes: 1,000 additions & 0 deletions dataset/vector/test/output.csv


73 changes: 73 additions & 0 deletions mixed_workload.py
@@ -0,0 +1,73 @@
import subprocess
import os
import time

# This script runs under the HammerDB CLI's Python interpreter
# (hammerdbcli py): dbset/diset/giset/vuset/vucreate/vudestroy,
# buildschema, loadscript, customscript, tcstart/tcstatus/tcstop and
# tclpy are provided by the CLI environment, not by imports.
tmpdir = os.getenv('TMP', "/tmp")

dbhost = "172.17.0.2"
dbname = "postgres"
dbuser = "postgres"
dbpass = "admin123"
dbport = 5432

dbset('db', 'pg')
dbset('bm', 'TPC-C')
dbset('vindex', 'hnsw')

diset('connection', 'pg_host', dbhost)
diset('connection', 'pg_port', dbport)
diset('connection', 'pg_sslmode', 'prefer')

diset('tpcc', 'pg_superuser', dbuser)
diset('tpcc', 'pg_superuserpass', dbpass)
diset('tpcc', 'pg_defaultdbase', dbname)
diset('tpcc', 'pg_user', dbuser)
diset('tpcc', 'pg_pass', dbpass)
diset('tpcc', 'pg_dbase', dbname)
diset('tpcc', 'pg_driver', 'timed')
diset('tpcc', 'pg_total_iterations', '10000000')
diset('tpcc', 'pg_rampup', '1')
diset('tpcc', 'pg_duration', '1')
diset('tpcc', 'pg_allwarehouse', 'false')
diset('tpcc', 'pg_timeprofile', 'true')
diset('tpcc', 'pg_vacuum', 'false')
giset("commandline", "keepalive_margin", "90")

print("STARTED LOADING VECTOR DATA IN DB AND BUILDING INDEX")
result = subprocess.run(["vectordbbench", "pgvectorhnsw", "--config-file", "/home/emumba/emumba/VDB/VectorDBBench/vectordb_bench/config-files/sample_config.yml"], capture_output=True)
print(result)
print("VECTOR DATA LOADED AND INDEX BUILD COMPLETE")

if result.returncode == 0:
    # Build the TPC-C schema, then run the timed mixed workload.
    buildschema()
    loadscript()
    vudestroy()
    print("TEST STARTED")
    vuset('vu', '4')
    vucreate()
    tcstart()
    tcstatus()
    jobid = tclpy.eval('vurun')
    vudestroy()
    tcstop()
    print("TEST COMPLETE")
    file_path = os.path.join(tmpdir, "pg_tprocc")
    fd = open(file_path, "w")
    fd.write(jobid)
    fd.close()
    time.sleep(10)

    print("STARTING RECALL CALCULATION")
    # Re-run with the test driver and a single VU to compute recall.
    diset('tpcc', 'pg_driver', 'test')
    customscript("recall_calculation.tcl")
    vuset("vu", "1")
    vucreate()
    jobid = tclpy.eval('vurun')
    vudestroy()
    print("TEST COMPLETE")
    # TODO: Fix - logs are not being written to file
    file_path = os.path.join(tmpdir, "pg_tprocc")
    fd = open(file_path, "w")
    fd.write(jobid)
    fd.close()
    print("RECALL CALCULATION COMPLETE")
exit()
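recall_calculation.tcl itself is not included in this diff. For reference, the metric it computes against the ground-truth file is presumably the standard recall@k; a sketch with illustrative names, not the Tcl script's code:

def recall_at_k(retrieved_ids, true_ids, k=10):
    # Fraction of the true top-k neighbours present in the retrieved top-k.
    return len(set(retrieved_ids[:k]) & set(true_ids[:k])) / k

# 8 of the 10 true neighbours were retrieved -> recall@10 = 0.8
print(recall_at_k([1, 2, 3, 4, 5, 6, 7, 8, 11, 12],
                  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))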
26 changes: 26 additions & 0 deletions modules/tpcccommon-1.0.tm
@@ -340,3 +340,29 @@
    if { $res_format eq "TPM" } {
        return "TEST RESULT : System achieved $nopm NOPM from $tpm $db TPM"
    }
}

proc load_vector_data { path is_ground_truth } {
    #TODO: Make it singleton
    set file [open $path r]
    set file_content [read $file]
    close $file
    set lines [split $file_content "\n"]
    set data {}
    for {set i 0} {$i < [llength $lines]} {incr i} {
        # Skip empty lines (e.g. the element produced by a trailing newline)
        if { [string trim [lindex $lines $i]] eq "" } { continue }
        # Each line has the form: id, "payload"
        set first_comma_index [string first "," [lindex $lines $i]]
        set id [string range [lindex $lines $i] 0 [expr {$first_comma_index - 1}]]
        set line [string range [lindex $lines $i] [expr {$first_comma_index + 2}] end] ;# +2 to skip comma and space
        # Remove the quotes from id and emb
        set id [string trim $id {"}]
        set line [string trim $line {"}]
        if { $is_ground_truth } {
            # Ground-truth neighbour ids are comma-separated; normalise
            # them to a space-separated Tcl list.
            set line [string map {"," " "} $line]
        }
        lappend data [list $id $line]
    }
    return $data
}

global vector_test_dataset vector_ground_truth
set vector_test_dataset [ load_vector_data "./dataset/vector/test/output.csv" "false" ]
set vector_ground_truth [ load_vector_data "./dataset/vector/ground_truth/output_gt.csv" "true" ]