From 4ace8f0d6991f6a1e8f535f51f3fca57eb9aacbf Mon Sep 17 00:00:00 2001
From: ThyrixYang
Date: Mon, 20 Mar 2017 20:39:07 +0800
Subject: [PATCH 1/2] Add benchmark script for mlpack_decision_tree

---
 config.yaml                     |  17 ++-
 methods/mlpack/decision_tree.py | 203 ++++++++++++++++++++++++++++++++
 2 files changed, 219 insertions(+), 1 deletion(-)
 create mode 100644 methods/mlpack/decision_tree.py

diff --git a/config.yaml b/config.yaml
index f676de1..8aa16f7 100644
--- a/config.yaml
+++ b/config.yaml
@@ -54,7 +54,22 @@ methods:
                 ['datasets/shuttle_train.csv', 'datasets/shuttle_test.csv'],
                 ['datasets/transfusion_train.csv', 'datasets/transfusion_test.csv'],
                 ['datasets/arcene_train.csv', 'datasets/arcene_test.csv'] ]
-
+    DTC:
+        run: ['metric']
+        script: methods/mlpack/decision_tree.py
+        format: [csv, txt]
+        datasets:
+            - files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
+                ['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
+                ['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
+                ['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
+                ['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
+                ['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
+                ['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
+                ['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
+                ['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
+                ['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
+                ['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
     NMF:
         run: ['metric']
         script: methods/mlpack/nmf.py
diff --git a/methods/mlpack/decision_tree.py b/methods/mlpack/decision_tree.py
new file mode 100644
index 0000000..4f03cd2
--- /dev/null
+++ b/methods/mlpack/decision_tree.py
@@ -0,0 +1,203 @@
+'''
+  @file decision_tree.py
+  @author Thyrix Yang
+
+  Class to benchmark the mlpack decision tree method.
+'''
+
+import os
+import sys
+import inspect
+import numpy as np
+
+# Import the util path, this method even works if the path contains symlinks to
+# modules.
+cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
+  os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
+if cmd_subfolder not in sys.path:
+  sys.path.insert(0, cmd_subfolder)
+
+# Import the metrics definitions path.
+metrics_folder = os.path.realpath(os.path.abspath(os.path.join(
+  os.path.split(inspect.getfile(inspect.currentframe()))[0], "../metrics")))
+if metrics_folder not in sys.path:
+  sys.path.insert(0, metrics_folder)
+
+from log import *
+from profiler import *
+from definitions import *
+from misc import *
+import shlex
+
+try:
+  import subprocess32 as subprocess
+except ImportError:
+  import subprocess
+
+import re
+import collections
+
+'''
+This class implements the Decision Tree benchmark.
+'''
+class DTC(object):
+
+  '''
+  Create the Decision Tree benchmark instance, show some information and
+  return the instance.
+
+  @param dataset - Input dataset to perform Decision Tree Prediction on.
+  @param timeout - The time until the timeout. Default no timeout.
+  @param path - Path to the mlpack executable.
+  @param verbose - Display informational messages.
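+  @param debug - Path to the mlpack debug executable, used by RunMemory().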
+  '''
+  def __init__(self, dataset, timeout=0, path=os.environ["MLPACK_BIN"],
+      verbose=True, debug=os.environ["MLPACK_BIN_DEBUG"]):
+    self.verbose = verbose
+    self.dataset = dataset
+    self.path = path
+    self.timeout = timeout
+    self.debug = debug
+
+    # Get the description from the executable.
+    cmd = shlex.split(self.path + "mlpack_decision_tree -h")
+    try:
+      s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
+    except Exception as e:
+      Log.Fatal("Could not execute command: " + str(cmd))
+    else:
+      # Use a regular expression pattern to get the description.
+      pattern = re.compile(br"""(.*?)Required.*?options:""",
+          re.VERBOSE|re.MULTILINE|re.DOTALL)
+
+      match = pattern.match(s)
+      if not match:
+        Log.Warn("Can't parse description", self.verbose)
+        description = ""
+      else:
+        description = match.group(1)
+
+      self.description = description
+
+  '''
+  Run the valgrind massif profiler on the Decision Tree method. If the method
+  has been successfully completed the report is saved in the specified file.
+
+  @param options - Extra options for the method.
+  @param fileName - The name of the massif output file.
+  @param massifOptions - Extra massif options.
+  @return Returns False if the method was not successful, if the method was
+  successful save the report file in the specified file.
+  '''
+  def RunMemory(self, options, fileName, massifOptions="--depth=2"):
+    Log.Info("Perform Decision Tree Memory Profiling.", self.verbose)
+
+    # If the dataset contains two files then the second file is the test file.
+    # In this case we add this to the command line.
+    if len(self.dataset) >= 2:
+      cmd = shlex.split(self.debug + "mlpack_decision_tree -t " +
+          self.dataset[0] + " -T " + self.dataset[1] + " -v " + options)
+    else:
+      Log.Fatal("This method requires at least two datasets.")
+      return False
+
+    return Profiler.MassifMemoryUsage(cmd, fileName, self.timeout, massifOptions)
+
+  '''
+  Perform Decision Tree Prediction. If the method has been successfully
+  completed return the elapsed time in seconds.
+
+  @param options - Extra options for the method.
+  @return - Elapsed time in seconds or a negative value if the method was not
+  successful.
+  '''
+  def RunMetrics(self, options):
+    Log.Info("Perform Decision Tree Prediction.", self.verbose)
+
+    # If the dataset contains two files then the second file is the test file.
+    # In this case we add this to the command line.
+    if len(self.dataset) >= 2:
+      cmd = shlex.split(self.path + "mlpack_decision_tree -t " +
+          self.dataset[0] + " -T " + self.dataset[1] + " -v -p output.csv " +
+          options)
+    else:
+      Log.Fatal("This method requires at least two datasets.")
+      return -1
+
+    # Run the command with the necessary arguments and return its output as a
+    # byte string. We have untrusted input so we disable all shell based features.
+    try:
+      s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False,
+          timeout=self.timeout)
+    except subprocess.TimeoutExpired as e:
+      Log.Warn(str(e))
+      return -2
+    except Exception as e:
+      Log.Fatal("Could not execute command: " + str(cmd))
+      return -1
+
+    # Data structure to store the results.
+    metrics = {}
+
+    # Parse the runtime from the timer data in the output.
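+    # With -v, mlpack prints its timers at the end of the output as lines of
+    # the form "loading_data: 0.0100s"; parseTimer() extracts these values.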
+    timer = self.parseTimer(s)
+
+    if timer != -1:
+      metrics['Runtime'] = timer.total_time - timer.loading_data - timer.saving_data
+      Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose)
+
+    # If the dataset contains three files then the third holds the true test
+    # labels.
+    if len(self.dataset) >= 3 and CheckFileAvailable('output.csv'):
+      truelabels = LoadDataset(self.dataset[2])
+      predictedlabels = LoadDataset("output.csv")
+
+      confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictedlabels)
+      AvgAcc = Metrics.AverageAccuracy(confusionMatrix)
+      AvgPrec = Metrics.AvgPrecision(confusionMatrix)
+      AvgRec = Metrics.AvgRecall(confusionMatrix)
+      AvgF = Metrics.AvgFMeasure(confusionMatrix)
+      AvgLift = Metrics.LiftMultiClass(confusionMatrix)
+      AvgMCC = Metrics.MCCMultiClass(confusionMatrix)
+      AvgInformation = Metrics.AvgMPIArray(confusionMatrix, truelabels,
+          predictedlabels)
+      SimpleMSE = Metrics.SimpleMeanSquaredError(truelabels, predictedlabels)
+      metrics['Avg Accuracy'] = AvgAcc
+      metrics['MultiClass Precision'] = AvgPrec
+      metrics['MultiClass Recall'] = AvgRec
+      metrics['MultiClass FMeasure'] = AvgF
+      metrics['MultiClass Lift'] = AvgLift
+      metrics['MultiClass MCC'] = AvgMCC
+      metrics['MultiClass Information'] = AvgInformation
+      metrics['Simple MSE'] = SimpleMSE
+
+    return metrics
+
+  '''
+  Parse the timer data from a given string.
+
+  @param data - String to parse timer data from.
+  @return - Namedtuple that contains the timer data or -1 in case of an error.
+  '''
+  def parseTimer(self, data):
+    # Compile the regular expression pattern into a regular expression object
+    # to parse the timer data.
+    pattern = re.compile(br"""
+        .*?loading_data: (?P<loading_data>.*?)s.*?
+        .*?saving_data: (?P<saving_data>.*?)s.*?
+        .*?total_time: (?P<total_time>.*?)s.*?
+        """, re.VERBOSE|re.MULTILINE|re.DOTALL)
+
+    match = pattern.match(data)
+    if not match:
+      Log.Fatal("Can't parse the data: wrong format")
+      return -1
+    else:
+      # Create a namedtuple and return the timer data.
+      timer = collections.namedtuple('timer', ["loading_data", "total_time",
+          "saving_data"])
+      return timer(float(match.group("loading_data")),
+          float(match.group("total_time")), float(match.group("saving_data")))

From a18e4541ccbe574b9cc11d738e36bdb7573aa08f Mon Sep 17 00:00:00 2001
From: thyrixyang
Date: Sun, 30 Apr 2017 10:32:52 +0800
Subject: [PATCH 2/2] Add test script for mlpack_decision_tree

---
 tests/benchmark_decision_tree.py | 33 +++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/tests/benchmark_decision_tree.py b/tests/benchmark_decision_tree.py
index ac10050..adc9a6f 100644
--- a/tests/benchmark_decision_tree.py
+++ b/tests/benchmark_decision_tree.py
@@ -80,3 +80,36 @@ def test_RunMetrics(self):
 
+'''
+Test the mlpack Decision Tree Prediction script.
+'''
+class DecisionTree_MLPACK_TEST(unittest.TestCase):
+
+  '''
+  Test initialization.
+  '''
+  def setUp(self):
+    self.dataset = ['datasets/iris_train.csv', 'datasets/iris_test.csv']
+    self.verbose = False
+    self.timeout = 9000
+
+    module = Loader.ImportModuleFromPath("methods/mlpack/decision_tree.py")
+    obj = getattr(module, "DTC")
+    self.instance = obj(self.dataset, verbose=self.verbose, timeout=self.timeout)
+
+  '''
+  Test the constructor.
+  '''
+  def test_Constructor(self):
+    self.assertEqual(self.instance.verbose, self.verbose)
+    self.assertEqual(self.instance.timeout, self.timeout)
+    self.assertEqual(self.instance.dataset, self.dataset)
+
+  '''
+  Test the 'RunMetrics' function.
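+  It should return a positive runtime on the iris dataset.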
+  '''
+  def test_RunMetrics(self):
+    result = self.instance.RunMetrics("")
+    self.assertTrue(result["Runtime"] > 0)
+
 if __name__ == '__main__':
   unittest.main()