
Reintroduce Join Order Benchmark #41

Open: wants to merge 18 commits into base branch refactoring.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -4,6 +4,9 @@
[submodule "tpch-kit"]
path = tpch-kit
url = https://github.com/marcelja/tpch-kit.git
[submodule "join-order-benchmark"]
path = join-order-benchmark
url = https://github.com/gregrahn/join-order-benchmark.git
[submodule "hypopg"]
path = hypopg
url = https://github.com/HypoPG/hypopg.git
2 changes: 1 addition & 1 deletion benchmark_results/job/config.json
@@ -1,6 +1,6 @@
{
"database_system": "postgres",
"benchmark_name": "JOB",
"benchmark_name": "job",
"scale_factor": 1,
"algorithms": [
{
92 changes: 68 additions & 24 deletions benchmark_results/notebooks/Graphs.ipynb
@@ -3,7 +3,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
@@ -37,7 +39,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"def get_costs(df):\n",
@@ -56,7 +60,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"def draw_cost_graph(cophy_costs=None, cophy_memory_consumption=None, legend=True):\n",
@@ -238,7 +244,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"def draw_legend():\n",
@@ -275,7 +283,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"CSV_PATH = '../tpch_wo_2_17_20'\n",
@@ -290,7 +300,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# old\n",
@@ -313,7 +325,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Cophy What-If time: 151.91098499298096 - cost_requests: 82676 - cache_hits: 45776 - Gurobi Times:\n",
@@ -333,7 +347,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"draw_what_if_graph()\n"
@@ -349,7 +365,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"CSV_PATH = '../tpch_wo_2_17_20/all_queries'\n",
@@ -527,7 +545,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"CSV_PATH = '../tpcds_wo_4_6_9_10_11_32_35_41_95'\n",
@@ -540,7 +560,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"cophy_memory_consumptions_mb = [250,500,1000,1500,2000,2500,3000,3500,4250,5000,5750,6500,8000,10000,12500,15000]\n",
@@ -561,7 +583,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Cophy What-If time: 579.6870040893555 - cost_requests: 394317 - cache_hits: 342140 - Gurobi Times:\n",
@@ -581,7 +605,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# draw_what_if_graph(million=True)"
@@ -597,11 +623,13 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"CSV_PATH = '../job/'\n",
"BENCHMARK = 'JOB'\n",
"CSV_PATH = '../job'\n",
"BENCHMARK = 'job'\n",
"SCALE_FACTOR = None\n",
"QUERIES = range(0, 113)\n",
"XLIM = 12"
@@ -610,7 +638,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# draw_cost_graph()\n",
@@ -630,7 +660,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Cophy What-If time: 822.8340845108032 - cost_requests: 305326 - cache_hits: 267996 - Gurobi Times:\n",
@@ -650,7 +682,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# GRAPH_SIZE = (5,2.2)\n",
@@ -660,14 +694,18 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"\n"
@@ -676,7 +714,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"CSV_PATH = '../tpch_mssql'\n",
@@ -690,7 +730,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"draw_cost_graph()"
@@ -699,7 +741,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": []
}
1 change: 1 addition & 0 deletions join-order-benchmark
Submodule join-order-benchmark added at e6d4ab
15 changes: 12 additions & 3 deletions selection/dbms/postgres_dbms.py
@@ -72,9 +72,18 @@ def create_database(self, database_name):
        self.exec_only("create database {}".format(database_name))
        logging.info("Database {} created".format(database_name))

    def import_data(self, table, path, delimiter="|"):
        with open(path, "r") as file:
            self._cursor.copy_from(file, table, sep=delimiter, null="")
    def import_data(self, table, path, delimiter="|", encoding=None):
        with open(path, encoding=encoding) as file:
            if encoding:
                self._cursor.copy_expert(
                    (
                        f"COPY {table} FROM STDIN WITH DELIMITER AS '{delimiter}' NULL "
                        f"AS 'NULL' CSV QUOTE AS '\"' ENCODING '{encoding}'"
                    ),
                    file,
                )
            else:
                self._cursor.copy_from(file, table, sep=delimiter, null="")

    def indexes_size(self):
        # Returns size in bytes
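For orientation, a minimal sketch of the two COPY paths above, written against psycopg2 directly. The database name, table names, file names, and the LATIN1 encoding are illustrative assumptions, not part of the diff:

import psycopg2

conn = psycopg2.connect(dbname="imdb")  # assumed connection parameters
cursor = conn.cursor()

# With an encoding, the new branch issues COPY ... CSV via copy_expert,
# presumably to handle non-UTF-8 bytes in the JOB/IMDB CSV dumps:
with open("title.csv", encoding="LATIN1") as f:
    cursor.copy_expert(
        "COPY title FROM STDIN WITH DELIMITER AS ',' NULL AS 'NULL' "
        "CSV QUOTE AS '\"' ENCODING 'LATIN1'",
        f,
    )

# Without an encoding, the original copy_from path is unchanged:
with open("lineitem.tbl") as f:
    cursor.copy_from(f, "lineitem", sep="|", null="")

conn.commit()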
5 changes: 4 additions & 1 deletion selection/index_selection_evaluation.py
@@ -61,14 +61,17 @@ def _setup_config(self, config):
        dbms_class = DBMSYSTEMS[config["database_system"]]
        generating_connector = dbms_class(None, autocommit=True)
        table_generator = TableGenerator(
            config["benchmark_name"], config["scale_factor"], generating_connector
            config["benchmark_name"],
            config["scale_factor"] if "scale_factor" in config else 1,
            generating_connector,
        )
        self.database_name = table_generator.database_name()
        self.database_system = config["database_system"]
        self.setup_db_connector(self.database_name, self.database_system)

        if "queries" not in config:
            config["queries"] = None

        query_generator = QueryGenerator(
            config["benchmark_name"],
            config["scale_factor"],
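A small sketch of the new scale_factor fallback, using a trimmed config dict shaped like benchmark_results/job/config.json; the snippet is illustrative only:

# JOB runs on a fixed-size IMDB data set (no scaling, see the assertion in
# query_generator.py below), so configs may omit scale_factor entirely;
# the fallback then defaults to 1.
config = {"database_system": "postgres", "benchmark_name": "job"}

scale_factor = config["scale_factor"] if "scale_factor" in config else 1
assert scale_factor == 1

# dict.get expresses the same default more compactly:
assert config.get("scale_factor", 1) == 1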
45 changes: 44 additions & 1 deletion selection/query_generator.py
@@ -136,6 +136,37 @@ def _run_command(self, command, return_output=False, shell=False):
    def _files(self):
        return os.listdir(self.directory)

    def _generate_job(self):
        logging.info("Generating JOB Queries")
        for filename in os.listdir(self.directory):
            if ".sql" not in filename or "fkindexes" in filename or "schema" in filename:
                continue
            query_id = filename.replace(".sql", "")

            with open(f"{self.directory}/{filename}") as query_file:
                query_text = query_file.read()
            query_text = query_text.replace("\t", "")
            query = Query(query_id, query_text)

            assert "WHERE" in query_text, "Query without WHERE clause encountered"

            split = query_text.split("WHERE")
            assert len(split) == 2, "Query split for JOB query contains subquery"
            query_text_before_where = split[0]
            query_text_after_where = split[1]
            # Add indexable columns to query
            for column in self.columns:
                if (
                    column.name in query_text_after_where
                    and f"{column.table.name} " in query_text_before_where
                ):
                    query.columns.append(column)
            self.queries.append(query)
            self._validate_query(query)

        logging.info("Queries generated")

Review thread on the condition "column.name in query_text_after_where" (marked as resolved):

Collaborator: Why not in query_text?

Contributor (author): I believe the reason was that column names before WHERE were not always clearly distinguishable. There are, for example, multiple tables with a column named info.

Collaborator: Should we add a comment in the code to explain the parsing difficulties and that indexable columns may be missing?

Contributor (author): Yes, we should probably do that or implement a more sophisticated parsing solution. I will add a comment to the code and an issue requesting improvements.
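To make the resolved discussion concrete, here is a self-contained sketch of the matching heuristic. The query text is an invented JOB-style example and is_indexable_candidate is a hypothetical helper, not code from this PR:

query_text = (
    "SELECT MIN(t.title) FROM movie_info mi, title t "
    "WHERE mi.info = 'Drama' AND mi.movie_id = t.id AND t.production_year > 2000"
)
query_text_before_where, query_text_after_where = query_text.split("WHERE")


def is_indexable_candidate(table_name, column_name):
    # Several IMDB tables (movie_info, person_info, ...) share a column
    # named info, so checking the column name against the full query text
    # would over-approximate. Requiring the table name to appear before
    # WHERE (i.e., in the FROM list) narrows the match, at the cost of
    # missing columns referenced only through aliases, which is the
    # limitation the reviewers agreed to document.
    return (
        column_name in query_text_after_where
        and f"{table_name} " in query_text_before_where
    )


print(is_indexable_candidate("movie_info", "info"))   # True
print(is_indexable_candidate("person_info", "info"))  # False: table not in this query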

    def generate(self):
        if self.benchmark_name == "tpch":
            self.directory = "./tpch-kit/dbgen"
@@ -154,5 +185,17 @@ def generate(self):
                self.make_command.append("OS=MACOS")

            self._generate_tpcds()
        elif self.benchmark_name == "job":
            assert self.scale_factor == 1, (
                "Can only handle JOB with a scale factor of 1"
                ", i.e., no specific scaling"
            )
            assert self.query_ids is None, (
                "Query filtering, i.e., providing query_ids to JOB QueryGenerator "
                "is not supported."
            )

            self.directory = "./join-order-benchmark"
            self._generate_job()
        else:
            raise NotImplementedError("only tpch/tpcds implemented.")
            raise NotImplementedError("Only TPC-H/-DS and JOB implemented.")