From 4ace8f0d6991f6a1e8f535f51f3fca57eb9aacbf Mon Sep 17 00:00:00 2001
From: ThyrixYang
Date: Mon, 20 Mar 2017 20:39:07 +0800
Subject: [PATCH 1/2] Add benchmark script for mlpack_decision_tree

---
 config.yaml                     |  17 ++-
 methods/mlpack/decision_tree.py | 203 ++++++++++++++++++++++++++++++++
 2 files changed, 219 insertions(+), 1 deletion(-)
 create mode 100644 methods/mlpack/decision_tree.py

diff --git a/config.yaml b/config.yaml
index f676de1..8aa16f7 100644
--- a/config.yaml
+++ b/config.yaml
@@ -54,7 +54,22 @@ methods:
                 ['datasets/shuttle_train.csv', 'datasets/shuttle_test.csv'],
                 ['datasets/transfusion_train.csv', 'datasets/transfusion_test.csv'],
                 ['datasets/arcene_train.csv', 'datasets/arcene_test.csv'] ]
-
+    DTC:
+        run: ['metric']
+        script: methods/mlpack/decision_tree.py
+        format: [csv, txt]
+        datasets:
+            - files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
+                ['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
+                ['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
+                ['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
+                ['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
+                ['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
+                ['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
+                ['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
+                ['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
+                ['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
+                ['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
     NMF:
         run: ['metric']
         script: methods/mlpack/nmf.py
diff --git a/methods/mlpack/decision_tree.py b/methods/mlpack/decision_tree.py
new file mode 100644
index 0000000..4f03cd2
--- /dev/null
+++ b/methods/mlpack/decision_tree.py
@@ -0,0 +1,203 @@
+'''
+  @file decision_tree.py
+  @author Thyrix Yang
+
+  Class to benchmark the mlpack decision tree method.
+'''
+
+import os
+import sys
+import inspect
+import numpy as np
+
+# Import the util path, this method even works if the path contains symlinks to
+# modules.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+  os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
+if cmd_subfolder not in sys.path:
+  sys.path.insert(0, cmd_subfolder)
+
+# Import the metrics definitions path.
+metrics_folder = os.path.realpath(os.path.abspath(os.path.join(
+  os.path.split(inspect.getfile(inspect.currentframe()))[0], "../metrics")))
+if metrics_folder not in sys.path:
+  sys.path.insert(0, metrics_folder)
+
+from log import *
+from profiler import *
+from definitions import *
+from misc import *
+import shlex
+
+try:
+  import subprocess32 as subprocess
+except ImportError:
+  import subprocess
+
+import re
+import collections
+
+'''
+This class implements the Decision Tree benchmark.
+'''
+class DTC(object):
+
+  '''
+  Create the Decision Tree benchmark instance, show some information and
+  return the instance.
+
+  @param dataset - Input dataset to perform Decision Tree Prediction on.
+  @param timeout - The time until the timeout. Default no timeout.
+  @param path - Path to the mlpack executable.
+  @param verbose - Display informational messages.
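+  @param debug - Path to the mlpack debug executable, used by RunMemory().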
+  '''
+  def __init__(self, dataset, timeout=0, path=os.environ["MLPACK_BIN"],
+      verbose=True, debug=os.environ["MLPACK_BIN_DEBUG"]):
+    self.verbose = verbose
+    self.dataset = dataset
+    self.path = path
+    self.timeout = timeout
+    self.debug = debug
+
+    # Get the description from the executable.
+    cmd = shlex.split(self.path + "mlpack_decision_tree -h")
+    try:
+      s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+    except Exception as e:
+      Log.Fatal("Could not execute command: " + str(cmd))
+    else:
+      # Use a regular expression pattern to get the description.
+      pattern = re.compile(br"""(.*?)Required.*?options:""",
+          re.VERBOSE|re.MULTILINE|re.DOTALL)
+
+      match = pattern.match(s)
+      if not match:
+        Log.Warn("Can't parse description", self.verbose)
+        description = ""
+      else:
+        description = match.group(1)
+
+      self.description = description
+
+  '''
+  Run the valgrind massif profiler on the Decision Tree method. If the method
+  has been successfully completed the report is saved in the specified file.
+
+  @param options - Extra options for the method.
+  @param fileName - The name of the massif output file.
+  @param massifOptions - Extra massif options.
+  @return Returns False if the method was not successful, if the method was
+  successful save the report file in the specified file.
+  '''
+  def RunMemory(self, options, fileName, massifOptions="--depth=2"):
+    Log.Info("Perform Decision Tree Memory Profiling.", self.verbose)
+
+    # If the dataset contains two files then the second file is the test file.
+    # In this case we add this to the command line.
+    if len(self.dataset) >= 2:
+      cmd = shlex.split(self.debug + "mlpack_decision_tree -t " +
+          self.dataset[0] + " -T " + self.dataset[1] + " -v " + options)
+    else:
+      Log.Fatal("This method requires at least two datasets.")
+      return False
+
+    return Profiler.MassifMemoryUsage(cmd, fileName, self.timeout, massifOptions)
+
+  '''
+  Perform Decision Tree Prediction. If the method has been successfully
+  completed return the elapsed time in seconds.
+
+  @param options - Extra options for the method.
+  @return - Elapsed time in seconds or a negative value if the method was not
+  successful.
+  '''
+  def RunMetrics(self, options):
+    Log.Info("Perform Decision Tree Prediction.", self.verbose)
+
+    # If the dataset contains two files then the second file is the test file.
+    # In this case we add this to the command line.
+    if len(self.dataset) >= 2:
+      cmd = shlex.split(self.path + "mlpack_decision_tree -t " +
+          self.dataset[0] + " -T " + self.dataset[1] + " -v -p output.csv " +
+          options)
+    else:
+      Log.Fatal("This method requires at least two datasets.")
+      return -1
+
+    # Run the command with the necessary arguments and return its output as a
+    # byte string. We have untrusted input so we disable all shell based features.
+    try:
+      s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False,
+          timeout=self.timeout)
+    except subprocess.TimeoutExpired as e:
+      Log.Warn(str(e))
+      return -2
+    except Exception as e:
+      Log.Fatal("Could not execute command: " + str(cmd))
+      return -1
+
+    # Data structure to store the results.
+    metrics = {}
+
+    # Parse the runtime from the timer data in the output.
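+    # With -v, mlpack prints its timers at the end of the output as lines of
+    # the form "loading_data: 0.0100s"; parseTimer() extracts these values.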
+    timer = self.parseTimer(s)
+
+    if timer != -1:
+      metrics['Runtime'] = timer.total_time - timer.loading_data - timer.saving_data
+      Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose)
+
+    # If the dataset contains three files then the third holds the true test
+    # labels.
+    if len(self.dataset) >= 3 and CheckFileAvailable('output.csv'):
+      truelabels = LoadDataset(self.dataset[2])
+      predictedlabels = LoadDataset("output.csv")
+
+      confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictedlabels)
+      AvgAcc = Metrics.AverageAccuracy(confusionMatrix)
+      AvgPrec = Metrics.AvgPrecision(confusionMatrix)
+      AvgRec = Metrics.AvgRecall(confusionMatrix)
+      AvgF = Metrics.AvgFMeasure(confusionMatrix)
+      AvgLift = Metrics.LiftMultiClass(confusionMatrix)
+      AvgMCC = Metrics.MCCMultiClass(confusionMatrix)
+      AvgInformation = Metrics.AvgMPIArray(confusionMatrix, truelabels,
+          predictedlabels)
+      SimpleMSE = Metrics.SimpleMeanSquaredError(truelabels, predictedlabels)
+      metrics['Avg Accuracy'] = AvgAcc
+      metrics['MultiClass Precision'] = AvgPrec
+      metrics['MultiClass Recall'] = AvgRec
+      metrics['MultiClass FMeasure'] = AvgF
+      metrics['MultiClass Lift'] = AvgLift
+      metrics['MultiClass MCC'] = AvgMCC
+      metrics['MultiClass Information'] = AvgInformation
+      metrics['Simple MSE'] = SimpleMSE
+
+    return metrics
+
+  '''
+  Parse the timer data from a given string.
+
+  @param data - String to parse timer data from.
+  @return - Namedtuple that contains the timer data or -1 in case of an error.
+  '''
+  def parseTimer(self, data):
+    # Compile the regular expression pattern into a regular expression object
+    # to parse the timer data.
+    pattern = re.compile(br"""
+        .*?loading_data: (?P<loading_data>.*?)s.*?
+        .*?saving_data: (?P<saving_data>.*?)s.*?
+        .*?total_time: (?P<total_time>.*?)s.*?
+        """, re.VERBOSE|re.MULTILINE|re.DOTALL)
+
+    match = pattern.match(data)
+    if not match:
+      Log.Fatal("Can't parse the data: wrong format")
+      return -1
+    else:
+      # Create a namedtuple and return the timer data.
+      timer = collections.namedtuple('timer', ["loading_data", "total_time",
+          "saving_data"])
+      return timer(float(match.group("loading_data")),
+          float(match.group("total_time")), float(match.group("saving_data")))

From a18e4541ccbe574b9cc11d738e36bdb7573aa08f Mon Sep 17 00:00:00 2001
From: thyrixyang
Date: Sun, 30 Apr 2017 10:32:52 +0800
Subject: [PATCH 2/2] Add test script for mlpack_decision_tree

---
 tests/benchmark_decision_tree.py | 33 +++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/tests/benchmark_decision_tree.py b/tests/benchmark_decision_tree.py
index ac10050..adc9a6f 100644
--- a/tests/benchmark_decision_tree.py
+++ b/tests/benchmark_decision_tree.py
@@ -80,3 +80,36 @@ def test_RunMetrics(self):
 
+'''
+Test the mlpack Decision Tree Prediction script.
+'''
+class DecisionTree_MLPACK_TEST(unittest.TestCase):
+
+  '''
+  Test initialization.
+  '''
+  def setUp(self):
+    self.dataset = ['datasets/iris_train.csv', 'datasets/iris_test.csv']
+    self.verbose = False
+    self.timeout = 9000
+
+    module = Loader.ImportModuleFromPath("methods/mlpack/decision_tree.py")
+    obj = getattr(module, "DTC")
+    self.instance = obj(self.dataset, verbose=self.verbose, timeout=self.timeout)
+
+  '''
+  Test the constructor.
+  '''
+  def test_Constructor(self):
+    self.assertEqual(self.instance.verbose, self.verbose)
+    self.assertEqual(self.instance.timeout, self.timeout)
+    self.assertEqual(self.instance.dataset, self.dataset)
+
+  '''
+  Test the 'RunMetrics' function.
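+  It should return a positive runtime on the iris dataset.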
+  '''
+  def test_RunMetrics(self):
+    result = self.instance.RunMetrics("")
+    self.assertTrue(result["Runtime"] > 0)
+
 if __name__ == '__main__':
   unittest.main()