
Reintroduce Join Order Benchmark #41

Open: wants to merge 18 commits into base branch refactoring.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -4,6 +4,9 @@
[submodule "tpch-kit"]
path = tpch-kit
url = https://github.com/marcelja/tpch-kit.git
[submodule "join-order-benchmark"]
path = join-order-benchmark
url = https://github.com/gregrahn/join-order-benchmark.git
[submodule "hypopg"]
path = hypopg
url = https://github.com/HypoPG/hypopg.git
2 changes: 1 addition & 1 deletion benchmark_results/job/config.json
@@ -1,6 +1,6 @@
{
"database_system": "postgres",
"benchmark_name": "JOB",
"benchmark_name": "job",
"scale_factor": 1,
"algorithms": [
{
92 changes: 68 additions & 24 deletions benchmark_results/notebooks/Graphs.ipynb
@@ -3,7 +3,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"import pandas as pd\n",
@@ -37,7 +39,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"def get_costs(df):\n",
@@ -56,7 +60,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"def draw_cost_graph(cophy_costs=None, cophy_memory_consumption=None, legend=True):\n",
@@ -238,7 +244,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"def draw_legend():\n",
@@ -275,7 +283,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"CSV_PATH = '../tpch_wo_2_17_20'\n",
@@ -290,7 +300,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# old\n",
@@ -313,7 +325,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Cophy What-If time: 151.91098499298096 - cost_requests: 82676 - cache_hits: 45776 - Gurobi Times:\n",
@@ -333,7 +347,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"draw_what_if_graph()\n"
@@ -349,7 +365,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"CSV_PATH = '../tpch_wo_2_17_20/all_queries'\n",
@@ -527,7 +545,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"CSV_PATH = '../tpcds_wo_4_6_9_10_11_32_35_41_95'\n",
@@ -540,7 +560,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"cophy_memory_consumptions_mb = [250,500,1000,1500,2000,2500,3000,3500,4250,5000,5750,6500,8000,10000,12500,15000]\n",
@@ -561,7 +583,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Cophy What-If time: 579.6870040893555 - cost_requests: 394317 - cache_hits: 342140 - Gurobi Times:\n",
@@ -581,7 +605,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# draw_what_if_graph(million=True)"
@@ -597,11 +623,13 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"CSV_PATH = '../job/'\n",
"BENCHMARK = 'JOB'\n",
"CSV_PATH = '../job'\n",
"BENCHMARK = 'job'\n",
"SCALE_FACTOR = None\n",
"QUERIES = range(0, 113)\n",
"XLIM = 12"
@@ -610,7 +638,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# draw_cost_graph()\n",
@@ -630,7 +660,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Cophy What-If time: 822.8340845108032 - cost_requests: 305326 - cache_hits: 267996 - Gurobi Times:\n",
@@ -650,7 +682,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# GRAPH_SIZE = (5,2.2)\n",
@@ -660,14 +694,18 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"\n"
@@ -676,7 +714,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"CSV_PATH = '../tpch_mssql'\n",
@@ -690,7 +730,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"draw_cost_graph()"
@@ -699,7 +741,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [],
"source": []
}
1 change: 1 addition & 0 deletions join-order-benchmark
Submodule join-order-benchmark added at e6d4ab
15 changes: 12 additions & 3 deletions selection/dbms/postgres_dbms.py
@@ -72,9 +72,18 @@ def create_database(self, database_name):
        self.exec_only("create database {}".format(database_name))
        logging.info("Database {} created".format(database_name))

    def import_data(self, table, path, delimiter="|"):
        with open(path, "r") as file:
            self._cursor.copy_from(file, table, sep=delimiter, null="")
    def import_data(self, table, path, delimiter="|", encoding=None):
        with open(path, encoding=encoding) as file:
            if encoding:
                self._cursor.copy_expert(
                    (
                        f"COPY {table} FROM STDIN WITH DELIMITER AS '{delimiter}' NULL "
                        f"AS 'NULL' CSV QUOTE AS '\"' ENCODING '{encoding}'"
                    ),
                    file,
                )
            else:
                self._cursor.copy_from(file, table, sep=delimiter, null="")

    def indexes_size(self):
        # Returns size in bytes
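For orientation, a minimal sketch of the two COPY paths above, written against psycopg2 directly. The database name, table names, file names, and the LATIN1 encoding are illustrative assumptions, not part of the diff:

import psycopg2

conn = psycopg2.connect(dbname="imdb")  # assumed connection parameters
cursor = conn.cursor()

# With an encoding, the new branch issues COPY ... CSV via copy_expert,
# presumably to handle non-UTF-8 bytes in the JOB/IMDB CSV dumps:
with open("title.csv", encoding="LATIN1") as f:
    cursor.copy_expert(
        "COPY title FROM STDIN WITH DELIMITER AS ',' NULL AS 'NULL' "
        "CSV QUOTE AS '\"' ENCODING 'LATIN1'",
        f,
    )

# Without an encoding, the original copy_from path is unchanged:
with open("lineitem.tbl") as f:
    cursor.copy_from(f, "lineitem", sep="|", null="")

conn.commit()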
5 changes: 4 additions & 1 deletion selection/index_selection_evaluation.py
@@ -61,14 +61,17 @@ def _setup_config(self, config):
        dbms_class = DBMSYSTEMS[config["database_system"]]
        generating_connector = dbms_class(None, autocommit=True)
        table_generator = TableGenerator(
            config["benchmark_name"], config["scale_factor"], generating_connector
            config["benchmark_name"],
            config["scale_factor"] if "scale_factor" in config else 1,
            generating_connector,
        )
        self.database_name = table_generator.database_name()
        self.database_system = config["database_system"]
        self.setup_db_connector(self.database_name, self.database_system)

        if "queries" not in config:
            config["queries"] = None

        query_generator = QueryGenerator(
            config["benchmark_name"],
            config["scale_factor"],
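A small sketch of the new scale_factor fallback, using a trimmed config dict shaped like benchmark_results/job/config.json; the snippet is illustrative only:

# JOB runs on a fixed-size IMDB data set (no scaling, see the assertion in
# query_generator.py below), so configs may omit scale_factor entirely;
# the fallback then defaults to 1.
config = {"database_system": "postgres", "benchmark_name": "job"}

scale_factor = config["scale_factor"] if "scale_factor" in config else 1
assert scale_factor == 1

# dict.get expresses the same default more compactly:
assert config.get("scale_factor", 1) == 1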
45 changes: 44 additions & 1 deletion selection/query_generator.py
@@ -136,6 +136,37 @@ def _run_command(self, command, return_output=False, shell=False):
    def _files(self):
        return os.listdir(self.directory)

    def _generate_job(self):
        logging.info("Generating JOB Queries")
        for filename in os.listdir(self.directory):
            if ".sql" not in filename or "fkindexes" in filename or "schema" in filename:
                continue
            query_id = filename.replace(".sql", "")

            with open(f"{self.directory}/{filename}") as query_file:
                query_text = query_file.read()
            query_text = query_text.replace("\t", "")
            query = Query(query_id, query_text)

            assert "WHERE" in query_text, "Query without WHERE clause encountered"

            split = query_text.split("WHERE")
            assert len(split) == 2, "Query split for JOB query contains subquery"
            query_text_before_where = split[0]
            query_text_after_where = split[1]
            # Add indexable columns to query
            for column in self.columns:
                if (
                    column.name in query_text_after_where
                    and f"{column.table.name} " in query_text_before_where
                ):
                    query.columns.append(column)
            self.queries.append(query)
            self._validate_query(query)

        logging.info("Queries generated")

Review thread on the condition "column.name in query_text_after_where" (marked as resolved):

Collaborator: Why not in query_text?

Contributor (author): I believe the reason was that column names before WHERE were not always clearly distinguishable. There are, for example, multiple tables with a column named info.

Collaborator: Should we add a comment in the code to explain the parsing difficulties and that indexable columns may be missing?

Contributor (author): Yes, we should probably do that or implement a more sophisticated parsing solution. I will add a comment to the code and an issue requesting improvements.
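To make the resolved discussion concrete, here is a self-contained sketch of the matching heuristic. The query text is an invented JOB-style example and is_indexable_candidate is a hypothetical helper, not code from this PR:

query_text = (
    "SELECT MIN(t.title) FROM movie_info mi, title t "
    "WHERE mi.info = 'Drama' AND mi.movie_id = t.id AND t.production_year > 2000"
)
query_text_before_where, query_text_after_where = query_text.split("WHERE")


def is_indexable_candidate(table_name, column_name):
    # Several IMDB tables (movie_info, person_info, ...) share a column
    # named info, so checking the column name against the full query text
    # would over-approximate. Requiring the table name to appear before
    # WHERE (i.e., in the FROM list) narrows the match, at the cost of
    # missing columns referenced only through aliases, which is the
    # limitation the reviewers agreed to document.
    return (
        column_name in query_text_after_where
        and f"{table_name} " in query_text_before_where
    )


print(is_indexable_candidate("movie_info", "info"))   # True
print(is_indexable_candidate("person_info", "info"))  # False: table not in this query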

    def generate(self):
        if self.benchmark_name == "tpch":
            self.directory = "./tpch-kit/dbgen"
@@ -154,5 +185,17 @@ def generate(self):
                self.make_command.append("OS=MACOS")

            self._generate_tpcds()
        elif self.benchmark_name == "job":
            assert self.scale_factor == 1, (
                "Can only handle JOB with a scale factor of 1"
                ", i.e., no specific scaling"
            )
            assert self.query_ids is None, (
                "Query filtering, i.e., providing query_ids to JOB QueryGenerator "
                "is not supported."
            )

            self.directory = "./join-order-benchmark"
            self._generate_job()
        else:
            raise NotImplementedError("only tpch/tpcds implemented.")
            raise NotImplementedError("Only TPC-H/-DS and JOB implemented.")