Add mixed workload #1

Open · wants to merge 32 commits into master
Commits (32)
3b4ae36
POC done, running vector and OLTP queries in parallel.
Sheharyar570 Aug 28, 2024
e7f8869
DB functions, prepared statement, and stored procedures added for vecto…
Sheharyar570 Aug 28, 2024
e515598
Assign vector and OLTP workload to different set of virtual users to …
Sheharyar570 Aug 28, 2024
2cb8bd0
Bug fixed: vu 1 exits with finished failed status, because not all vi…
Sheharyar570 Aug 28, 2024
9561caf
Added QPS metric for vector queries.
Sheharyar570 Aug 28, 2024
7329a02
Added Recall metric calculation for vector queries
Sheharyar570 Aug 28, 2024
93f9c61
Add QPS calculation in xtprof
wahajali Aug 29, 2024
f2718d4
Vector QPS calculation - ignore rampup data
wahajali Aug 29, 2024
8a01ff2
Merge pull request #2 from EmumbaOrg/add-qps-calculation
Sheharyar570 Aug 29, 2024
1be4d57
Dataset load in memory and read optimization and some refactoring
Sheharyar570 Aug 30, 2024
e6a4a06
Updated OLTP workload to start profiling after rampup time.
Sheharyar570 Aug 30, 2024
44cf3bf
Recall calculated after test run using custom script.
Sheharyar570 Aug 30, 2024
9ad3d01
Vector workload related params configuration enabled from config file.
Sheharyar570 Sep 5, 2024
cded18c
Update vector related params from HammerDB CLI.
Sheharyar570 Sep 5, 2024
5f8667e
Fixed issue with setting session params.
Sheharyar570 Sep 11, 2024
6ab1e21
Added automation script to run mixed workload with different configu…
Sheharyar570 Sep 11, 2024
7ff3f30
Updated xtprof module to store results in path defined in a separate …
Sheharyar570 Sep 11, 2024
9dfcea1
Add distance operator dynamically to prepared statement. Updated vect…
Sheharyar570 Sep 12, 2024
485d84e
Use relative path to test and ground_truth data files.
Sheharyar570 Sep 12, 2024
79dbd3e
Configure vectordb dict in hammerdb before running each benchmark case.
Sheharyar570 Sep 12, 2024
e9ddb0c
Update config.json to support optional build schema, warehouse count,…
Sheharyar570 Sep 17, 2024
ab0a4ad
Add NOPM and TPM metric in xtprof module
Sheharyar570 Sep 17, 2024
27a7d8a
Added pgdiskann support
Sheharyar570 Sep 19, 2024
8e2fc30
Updated config file name
Sheharyar570 Sep 19, 2024
ccde878
Fixed diskann automation script
Sheharyar570 Sep 19, 2024
735a8df
Fixed bug
Sheharyar570 Sep 24, 2024
1d44c44
Add mixed workload distribution between vu ratio configuration param
Sheharyar570 Sep 24, 2024
95a0177
Add vector table name configuration parameter.
Sheharyar570 Sep 24, 2024
898f83a
Removed stored procedures option for vector search.
Sheharyar570 Oct 2, 2024
4a26393
Script to combine benchmark results in multiple files into single jso…
Sheharyar570 Oct 2, 2024
09d1614
Dump HammerDB config in logs
Sheharyar570 Oct 4, 2024
d15d1ed
Updated script to handle results in multiple directories
Sheharyar570 Oct 4, 2024
74 changes: 74 additions & 0 deletions combine_bm_results.py
@@ -0,0 +1,74 @@
import os
import re
import json
import argparse

parser = argparse.ArgumentParser(description="Combine benchmark results")
parser.add_argument("--benchmark", type=str, choices=["HammerDB", "VectorDBBench"], help="Path to the directory where results are stored")
parser.add_argument("--results-dir-path", type=str, help="Path to the directory where results are stored")
args = parser.parse_args()


def extract_from_hdbxtprofile(file_path):
    """Parse the per-VU summary blocks from the xtprof profiler log."""
    with open(file_path, "r") as file:
        content = file.read()
    summaries = re.findall(
        r'>>>>> SUMMARY OF ([\d.]+) ACTIVE VIRTUAL USERS.*?AVG:\s*([\d.]+)ms.*?P99:\s*([\d.]+).*?TOTAL VECTOR QPS:\s*([\d.]+).*?NOPM:\s*(\d+).*?TPM:\s*(\d+)',
        content, re.DOTALL | re.IGNORECASE
    )
    results = {}

    for summary in summaries:
        user_count = int(summary[0])
        avg_latency = float(summary[1])
        p99_latency = float(summary[2])
        total_vector_qps = float(summary[3])
        # Capture groups 4 and 5 are NOPM and TPM, in that order (see the
        # regex above), so assign them accordingly.
        nopm = int(summary[4])
        tpm = int(summary[5])

        results[user_count] = {
            "Total Vector QPS": total_vector_qps,
            "TPM": tpm,
            "NOPM": nopm,
            "avg_latency": avg_latency,
            "P99 Latency": p99_latency
        }

    return results


def extract_from_log_txt(file_path):
    """Collect the 'key: value' header lines written before the command dump."""
    result = {}
    with open(file_path, "r") as file:
        for line in file:
            if line.startswith("Running command:"):
                break
            if ":" in line:
                key, value = line.split(":", 1)
                result[key.strip()] = value.strip()
    return result


def extract_from_json(file_path):
    with open(file_path, "r") as file:
        return json.load(file)


def main():
    for root, _, file_names in os.walk(args.results_dir_path):
        combined_data = {}
        for file_name in file_names:
            if file_name == "log.txt":
                combined_data["config"] = extract_from_log_txt(os.path.join(root, file_name))
            elif args.benchmark == "HammerDB" and file_name == "hdbxtprofile.log":
                combined_data["HammerDB"] = extract_from_hdbxtprofile(os.path.join(root, file_name))
            elif file_name.startswith("result") and file_name.endswith(".json"):
                combined_data["vectordb"] = extract_from_json(os.path.join(root, file_name))['results']

        if len(file_names) > 0:
            result_file_name = os.path.basename(root) + "-results.json"
            with open(os.path.join(root, result_file_name), "w") as result_file:
                json.dump(combined_data, result_file, indent=4)
            print(f"File saved in {root}")

    print("Results combined successfully")


if __name__ == "__main__":
    main()
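For reference, the script is invoked per the argparse definitions above, e.g. python combine_bm_results.py --benchmark HammerDB --results-dir-path ./results (the path being wherever the automation scripts wrote results). Every directory under that path containing a log.txt, an hdbxtprofile.log, or a result*.json then gets a companion <dirname>-results.json with the merged config, HammerDB, and vectordb sections.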
48 changes: 48 additions & 0 deletions config.json
@@ -0,0 +1,48 @@
{
    "database": {
        "host": "172.17.0.2",
        "username": "postgres",
        "password": "admin123",
        "db_name": "ann",
        "instance_type": "db.m6i.large",
        "provider": "aws",
        "enable_seqscan": "on"
    },
    "hammerdb": {
        "db": "pg",
        "bm": "TPC-C",
        "vindex": "hnsw",
        "vector_table_name": "public.pg_vector_collection",
        "build_schema": true,
        "pg_driver": "timed",
        "pg_total_iterations": "10000000",
        "pg_count_ware": "400",
        "pg_num_vu": "10",
        "pg_rampup": "0",
        "pg_duration": "1",
        "pg_allwarehouse": "false",
        "pg_timeprofile": "true",
        "pg_vacuum": "false",
        "keepalive_margin": "90"
    },
    "cases": [
        {
            "db-label": "run1-seqon",
            "drop_old": true,
            "load": true,
            "search-serial": false,
            "search-concurrent": false,
            "case-type": "Performance1536D50K",
            "maintenance-work-mem": "4GB",
            "max-parallel-workers": 2,
            "ef-search": [10, 20, 40, 80, 120, 200, 400],
            "ef-construction": 32,
            "m": 8,
            "num-concurrency": ["1", "10", "20", "30", "40", "50", "60", "70", "80", "90", "100"],
            "concurrency-duration": 30,
            "k": 10,
            "mw_oltp_vector_vu_ratio": "0.8",
            "run_count": 1
        }
    ]
}
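One note on mw_oltp_vector_vu_ratio: it sets what fraction of the virtual users run one side of the mixed workload, with the remainder running the other. A minimal sketch of the split, assuming the ratio is the OLTP fraction and ordinary rounding (the exact rounding the Tcl scripts apply is not shown in this diff):

def split_virtual_users(total_vu: int, oltp_vector_vu_ratio: float):
    # A ratio of 0.8 with 10 VUs gives 8 OLTP VUs and 2 vector VUs.
    # The rounding behaviour here is an assumption, not HammerDB's code.
    oltp_vu = round(total_vu * oltp_vector_vu_ratio)
    vector_vu = total_vu - oltp_vu
    return oltp_vu, vector_vu

print(split_virtual_users(10, 0.8))  # (8, 2)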
5 changes: 4 additions & 1 deletion config/generic.xml
@@ -75,6 +75,9 @@
   <profiler>xtprof</profiler>
   <xt_unique_log_name>0</xt_unique_log_name>
   <xt_gather_timeout>60</xt_gather_timeout>
-  <xt_job_storage>1</xt_job_storage>
+  <xt_job_storage>0</xt_job_storage>
  </timeprofile>
+ <vectordb>
+   <vindex>hnsw</vindex>
+ </vectordb>
 </hammerdb>
63 changes: 63 additions & 0 deletions config/vectordb.xml
@@ -0,0 +1,63 @@
<?xml version="1.0" encoding="utf-8"?>
<vectordb>
<ivfflat>
<name>IVF_FLAT</name>
<desc>Inverted File Flat Index</desc>
<session_params>
<ss_probes>10</ss_probes>
</session_params>
<search_params>
<se_k>10</se_k>
<se_distance>cosine</se_distance>
</search_params>
<index_params>
<in_max_parallel_workers>4</in_max_parallel_workers>
<in_maintenance_work_mem>8GB</in_maintenance_work_mem>
</index_params>
<index_creation_with_options>
<ino_lists>100</ino_lists>
</index_creation_with_options>
</ivfflat>
<hnsw>
<name>HNSW</name>
<desc>Hierarchical Navigable Small World</desc>
<session_params>
<ss_hnsw.ef_search>100</ss_hnsw.ef_search>
</session_params>
<search_params>
<se_k>10</se_k>
<se_distance>cosine</se_distance>
</search_params>
<index_params>
<in_max_parallel_workers>4</in_max_parallel_workers>
<in_maintenance_work_mem>8GB</in_maintenance_work_mem>
</index_params>
<index_creation_with_options>
<ino_ef_construction>200</ino_ef_construction>
<ino_m>16</ino_m>
</index_creation_with_options>
</hnsw>
<pgdiskann>
<name>PGDiskAnn</name>
<desc>DiskAnn</desc>
<session_params>
<ss_diskann.l_value_is>100</ss_diskann.l_value_is>
</session_params>
<search_params>
<se_k>10</se_k>
<se_distance>cosine</se_distance>
</search_params>
<index_params>
<in_max_parallel_workers>4</in_max_parallel_workers>
<in_maintenance_work_mem>8GB</in_maintenance_work_mem>
</index_params>
<index_creation_with_options>
<ino_max_neighbors>128</ino_max_neighbors>
<ino_l_value_ib>16</ino_l_value_ib>
</index_creation_with_options>
</pgdiskann>
<mixed_workload>
<mw_oltp_vector_vu_ratio>0.8</mw_oltp_vector_vu_ratio>
<vector_table_name>public.pg_vector_collection</vector_table_name>
</mixed_workload>
</vectordb>
1,000 changes: 1,000 additions & 0 deletions dataset/vector/ground_truth/output_gt.csv


1,000 changes: 1,000 additions & 0 deletions dataset/vector/test/output.csv


73 changes: 73 additions & 0 deletions mixed_workload.py
@@ -0,0 +1,73 @@
import subprocess
import os
import time

# This script runs under the HammerDB CLI's Python interpreter
# (hammerdbcli py): dbset/diset/giset/vuset/vucreate/vudestroy,
# buildschema, loadscript, customscript, tcstart/tcstatus/tcstop and
# tclpy are provided by the CLI environment, not by imports.
tmpdir = os.getenv('TMP', "/tmp")

dbhost = "172.17.0.2"
dbname = "postgres"
dbuser = "postgres"
dbpass = "admin123"
dbport = 5432

dbset('db', 'pg')
dbset('bm', 'TPC-C')
dbset('vindex', 'hnsw')

diset('connection', 'pg_host', dbhost)
diset('connection', 'pg_port', dbport)
diset('connection', 'pg_sslmode', 'prefer')

diset('tpcc', 'pg_superuser', dbuser)
diset('tpcc', 'pg_superuserpass', dbpass)
diset('tpcc', 'pg_defaultdbase', dbname)
diset('tpcc', 'pg_user', dbuser)
diset('tpcc', 'pg_pass', dbpass)
diset('tpcc', 'pg_dbase', dbname)
diset('tpcc', 'pg_driver', 'timed')
diset('tpcc', 'pg_total_iterations', '10000000')
diset('tpcc', 'pg_rampup', '1')
diset('tpcc', 'pg_duration', '1')
diset('tpcc', 'pg_allwarehouse', 'false')
diset('tpcc', 'pg_timeprofile', 'true')
diset('tpcc', 'pg_vacuum', 'false')
giset("commandline", "keepalive_margin", "90")

print("STARTED LOADING VECTOR DATA IN DB AND BUILDING INDEX")
result = subprocess.run(["vectordbbench", "pgvectorhnsw", "--config-file", "/home/emumba/emumba/VDB/VectorDBBench/vectordb_bench/config-files/sample_config.yml"], capture_output=True)
print(result)
print("VECTOR DATA LOADED AND INDEX BUILD COMPLETE")

if result.returncode == 0:
    # Build the TPC-C schema, then run the timed mixed workload.
    buildschema()
    loadscript()
    vudestroy()
    print("TEST STARTED")
    vuset('vu', '4')
    vucreate()
    tcstart()
    tcstatus()
    jobid = tclpy.eval('vurun')
    vudestroy()
    tcstop()
    print("TEST COMPLETE")
    file_path = os.path.join(tmpdir, "pg_tprocc")
    fd = open(file_path, "w")
    fd.write(jobid)
    fd.close()
    time.sleep(10)

    print("STARTING RECALL CALCULATION")
    # Re-run with the test driver and a single VU to compute recall.
    diset('tpcc', 'pg_driver', 'test')
    customscript("recall_calculation.tcl")
    vuset("vu", "1")
    vucreate()
    jobid = tclpy.eval('vurun')
    vudestroy()
    print("TEST COMPLETE")
    # TODO: Fix - logs are not being written to file
    file_path = os.path.join(tmpdir, "pg_tprocc")
    fd = open(file_path, "w")
    fd.write(jobid)
    fd.close()
    print("RECALL CALCULATION COMPLETE")
exit()
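recall_calculation.tcl itself is not included in this diff. For reference, the metric it computes against the ground-truth file is presumably the standard recall@k; a sketch with illustrative names, not the Tcl script's code:

def recall_at_k(retrieved_ids, true_ids, k=10):
    # Fraction of the true top-k neighbours present in the retrieved top-k.
    return len(set(retrieved_ids[:k]) & set(true_ids[:k])) / k

# 8 of the 10 true neighbours were retrieved -> recall@10 = 0.8
print(recall_at_k([1, 2, 3, 4, 5, 6, 7, 8, 11, 12],
                  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))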
26 changes: 26 additions & 0 deletions modules/tpcccommon-1.0.tm
@@ -340,3 +340,29 @@
    if { $res_format eq "TPM" } {
        return "TEST RESULT : System achieved $nopm NOPM from $tpm $db TPM"
    }
}

proc load_vector_data { path is_ground_truth } {
    #TODO: Make it singleton
    set file [open $path r]
    set file_content [read $file]
    close $file
    set lines [split $file_content "\n"]
    set data {}
    for {set i 0} {$i < [llength $lines]} {incr i} {
        # Skip empty lines (e.g. the element produced by a trailing newline)
        if { [string trim [lindex $lines $i]] eq "" } { continue }
        # Each line has the form: id, "payload"
        set first_comma_index [string first "," [lindex $lines $i]]
        set id [string range [lindex $lines $i] 0 [expr {$first_comma_index - 1}]]
        set line [string range [lindex $lines $i] [expr {$first_comma_index + 2}] end] ;# +2 to skip comma and space
        # Remove the quotes from id and emb
        set id [string trim $id {"}]
        set line [string trim $line {"}]
        if { $is_ground_truth } {
            # Ground-truth neighbour ids are comma-separated; normalise
            # them to a space-separated Tcl list.
            set line [string map {"," " "} $line]
        }
        lappend data [list $id $line]
    }
    return $data
}

global vector_test_dataset vector_ground_truth
set vector_test_dataset [ load_vector_data "./dataset/vector/test/output.csv" "false" ]
set vector_ground_truth [ load_vector_data "./dataset/vector/ground_truth/output_gt.csv" "true" ]