forked from edin-dal/sdql-benchmark
-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractor.py
63 lines (51 loc) · 2.08 KB
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import json
import os
import re
import subprocess
from collections import defaultdict
from typing import Iterable, Final
from benches import average_times, SEC_TO_MS
from connectors.hyper import LOG_PATH, Hyper
# Directory containing this file; used as the working directory when the
# Hyper log is searched.
REPO_ROOT = os.path.dirname(os.path.realpath(__file__))
# Captures the one- or two-digit query number following "TPC-H Query ".
# The optional leading digit is the class [12]; a "|" inside a character
# class is a literal pipe, not alternation, so it must not appear there.
RE_TPCH: Final[re.Pattern] = re.compile(r"TPC-H Query ([12]?\d)")
def hyper_dry_run(indices: Iterable[int], queries: Iterable[str], threads: int) -> None:
    """Run every query once on a Hyper instance opened with ``is_log=True``.

    The query results are thrown away; the point of this pass is only to
    populate the log so that timings can be extracted from it afterwards.

    Args:
        indices: Query numbers, paired positionally with ``queries``.
        queries: SQL text of each query to execute.
        threads: Thread count handed to ``Hyper``.
    """
    with Hyper(threads=threads, is_log=True) as db:
        # The index is not needed here; zipping only keeps the pairing, so
        # iteration stops at the shorter of the two iterables.
        for _index, query in zip(indices, queries):
            # Close the result straight away - its rows are never read.
            db.conn.execute_query(query).close()
def extract_hyper_log_times(
    indices: Iterable[int], queries: Iterable[str], threads: int
) -> tuple[list[float], list[float]]:
    """Run the queries with logging on and pull per-query timings from the log.

    Each log line containing ``"execution-time"`` that also mentions a
    ``TPC-H Query <n>`` marker is parsed as JSON; its ``v.elapsed`` and
    ``v.execution-time`` values are multiplied by ``SEC_TO_MS`` and grouped
    by query number ``n``. Each group is then reduced with ``average_times``.

    Args:
        indices: Query numbers, forwarded to ``hyper_dry_run``.
        queries: SQL text of the queries, forwarded to ``hyper_dry_run``.
        threads: Thread count, forwarded to ``hyper_dry_run``.

    Returns:
        ``(elapsed_times, execution_times)``: two lists holding one averaged
        value per query number found in the log, in ascending query order.
    """
    print("Generating Hyper log")
    # Benchmark runs elsewhere do not generate logs (in case logging affects
    # performance), so a separate logged run is made here.
    hyper_dry_run(indices, queries, threads)
    print("Hyper log generated")

    print("Processing Hyper log")
    # Pre-filter the log with grep so that only timing entries are parsed.
    grep_output = subprocess.run(
        ["grep", '"execution-time"', LOG_PATH],
        cwd=REPO_ROOT,
        stdout=subprocess.PIPE,
    ).stdout.decode()

    i_to_elapsed_times = defaultdict(list)
    i_to_execution_times = defaultdict(list)
    for line in grep_output.splitlines():
        m = RE_TPCH.search(line)
        if m is None:
            # Timing entry that does not belong to a tagged TPC-H query.
            continue
        i = int(m.group(1))
        values = json.loads(line)["v"]
        i_to_elapsed_times[i].append(values["elapsed"] * SEC_TO_MS)
        i_to_execution_times[i].append(values["execution-time"] * SEC_TO_MS)

    # All "Elapsed" averages are computed before any "Execution" ones.
    elapsed_times = _mean_times_by_query(i_to_elapsed_times, "Elapsed")
    execution_times = _mean_times_by_query(i_to_execution_times, "Execution")

    print("Hyper log processed")
    return elapsed_times, execution_times


def _mean_times_by_query(times_by_query: dict, label: str) -> list:
    """Reduce each query's samples with ``average_times``, in query-number order."""
    return [
        average_times(times_by_query[i], i, label)
        for i in sorted(times_by_query)
    ]