Add benchmark script for mlpack_decision_tree #30

Open · wants to merge 3 commits into master
17 changes: 16 additions & 1 deletion config.yaml
@@ -54,7 +54,22 @@ methods:
                 ['datasets/shuttle_train.csv', 'datasets/shuttle_test.csv'],
                 ['datasets/transfusion_train.csv', 'datasets/transfusion_test.csv'],
                 ['datasets/arcene_train.csv', 'datasets/arcene_test.csv'] ]

  DTC:
    run: ['metric']
    script: methods/mlpack/decision_tree.py
    format: [csv, txt]
    datasets:
      - files: [ ['datasets/iris_train.csv', 'datasets/iris_test.csv', 'datasets/iris_labels.csv'],
                 ['datasets/oilspill_train.csv', 'datasets/oilspill_test.csv', 'datasets/oilspill_labels.csv'],
                 ['datasets/scene_train.csv', 'datasets/scene_test.csv', 'datasets/scene_labels.csv'],
                 ['datasets/webpage_train.csv', 'datasets/webpage_test.csv', 'datasets/webpage_labels.csv'],
                 ['datasets/isolet_train.csv', 'datasets/isolet_test.csv', 'datasets/isolet_labels.csv'],
                 ['datasets/mammography_train.csv', 'datasets/mammography_test.csv', 'datasets/mammography_labels.csv'],
                 ['datasets/abalone19_train.csv', 'datasets/abalone19_test.csv', 'datasets/abalone19_labels.csv'],
                 ['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
                 ['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
                 ['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
                 ['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
  NMF:
    run: ['metric']
    script: methods/mlpack/nmf.py
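For reference, a minimal sketch of how the DTC entry above might be consumed (illustrative only: it assumes PyYAML and a single top-level methods: mapping, as the hunk header suggests; the real benchmark harness uses its own config loader):

import yaml  # assumption: PyYAML is available

# Read the benchmark configuration and print the DTC dataset triples.
with open("config.yaml") as f:
  config = yaml.safe_load(f)

dtc = config["methods"]["DTC"]
for train, test, labels in dtc["datasets"][0]["files"]:
  # Each entry is a [train, test, labels] triple of CSV paths; the labels
  # file provides the true test labels used for the classification metrics.
  print(train, test, labels)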
198 changes: 198 additions & 0 deletions methods/mlpack/decision_tree.py
@@ -0,0 +1,198 @@
'''
@file decision_tree.py
@author Thyrix Yang

Class to benchmark the mlpack decision tree method.
'''

import os
import sys
import inspect
import numpy as np

# Import the util path; this method works even if the path contains symlinks
# to modules.
cmd_subfolder = os.path.realpath(os.path.abspath(os.path.join(
    os.path.split(inspect.getfile(inspect.currentframe()))[0], "../../util")))
if cmd_subfolder not in sys.path:
  sys.path.insert(0, cmd_subfolder)

# Import the metrics definitions path.
metrics_folder = os.path.realpath(os.path.abspath(os.path.join(
    os.path.split(inspect.getfile(inspect.currentframe()))[0], "../metrics")))
if metrics_folder not in sys.path:
  sys.path.insert(0, metrics_folder)

from log import *
from profiler import *
from definitions import *
from misc import *
import shlex

try:
  import subprocess32 as subprocess
except ImportError:
  import subprocess

import re
import collections

'''
This class implements the Decision Tree benchmark.
'''
class DTC(object):

  '''
  Create the Decision Tree benchmark instance, show some information, and
  return the instance.

  @param dataset - Input dataset to perform Decision Tree prediction on.
  @param timeout - The time until the timeout. Default no timeout.
  @param path - Path to the mlpack executable.
  @param verbose - Display informational messages.
  @param debug - Path to the mlpack debug executable.
  '''
  def __init__(self, dataset, timeout=0, path=os.environ["MLPACK_BIN"],
      verbose=True, debug=os.environ["MLPACK_BIN_DEBUG"]):
    self.verbose = verbose
    self.dataset = dataset
    self.path = path
    self.timeout = timeout
    self.debug = debug

    # Get the description from the executable.
    cmd = shlex.split(self.path + "mlpack_decision_tree -h")
    try:
      s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False)
    except Exception as e:
      Log.Fatal("Could not execute command: " + str(cmd))
    else:
      # Use a regular expression pattern to get the description.
      pattern = re.compile(br"""(.*?)Required.*?options:""",
          re.VERBOSE|re.MULTILINE|re.DOTALL)

      match = pattern.match(s)
      if not match:
        Log.Warn("Can't parse description", self.verbose)
        description = ""
      else:
        description = match.group(1)

      self.description = description

  '''
  Run the valgrind massif profiler on the Decision Tree method. If the method
  has been successfully completed, the report is saved in the specified file.

  @param options - Extra options for the method.
  @param fileName - The name of the massif output file.
  @param massifOptions - Extra massif options.
  @return Returns False if the method was not successful; if the method was
  successful, saves the report in the specified file.
  '''
  def RunMemory(self, options, fileName, massifOptions="--depth=2"):
    Log.Info("Perform Decision Tree Memory Profiling.", self.verbose)

    # If the dataset contains at least two files, the second file is the test
    # file; in this case we add it to the command line.
    if len(self.dataset) >= 2:
      cmd = shlex.split(self.debug + "mlpack_decision_tree -t " +
          self.dataset[0] + " -T " + self.dataset[1] + " -v " + options)
    else:
      Log.Fatal("This method requires at least two datasets.")
      return -1

    return Profiler.MassifMemoryUsage(cmd, fileName, self.timeout, massifOptions)

  '''
  Perform Decision Tree prediction. If the method has been successfully
  completed, return the elapsed time in seconds.

  @param options - Extra options for the method.
  @return - Elapsed time in seconds or a negative value if the method was not
  successful.
  '''
  def RunMetrics(self, options):
    Log.Info("Perform Decision Tree Prediction.", self.verbose)

    # If the dataset contains at least two files, the second file is the test
    # file; in this case we add it to the command line.
    if len(self.dataset) >= 2:
      cmd = shlex.split(self.path + "mlpack_decision_tree -t " +
          self.dataset[0] + " -T " + self.dataset[1] +
          " -v -p output.csv " + options)
    else:
      Log.Fatal("This method requires at least two datasets.")
      return -1

    # Run the command with the necessary arguments and return its output as a
    # byte string. We have untrusted input, so we disable all shell based
    # features.
    try:
      s = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=False,
          timeout=self.timeout)
    except subprocess.TimeoutExpired as e:
      Log.Warn(str(e))
      return -2
    except Exception as e:
      Log.Fatal("Could not execute command: " + str(cmd))
      return -1

    # Data structure to store the results.
    metrics = {}

    # Parse the timer data (runtime without the data loading and saving steps).
    timer = self.parseTimer(s)

    if timer != -1:
      metrics['Runtime'] = timer.total_time - timer.loading_data - timer.saving_data
      Log.Info(("total time: %fs" % (metrics['Runtime'])), self.verbose)

    # If the dataset contains three files, the third file holds the true test
    # labels, so the classification metrics can be computed.
    if len(self.dataset) >= 3 and CheckFileAvailable('output.csv'):
      truelabels = LoadDataset(self.dataset[2])
      predictedlabels = LoadDataset("output.csv")

      confusionMatrix = Metrics.ConfusionMatrix(truelabels, predictedlabels)
      AvgAcc = Metrics.AverageAccuracy(confusionMatrix)
      AvgPrec = Metrics.AvgPrecision(confusionMatrix)
      AvgRec = Metrics.AvgRecall(confusionMatrix)
      AvgF = Metrics.AvgFMeasure(confusionMatrix)
      AvgLift = Metrics.LiftMultiClass(confusionMatrix)
      AvgMCC = Metrics.MCCMultiClass(confusionMatrix)
      AvgInformation = Metrics.AvgMPIArray(confusionMatrix, truelabels,
          predictedlabels)
      SimpleMSE = Metrics.SimpleMeanSquaredError(truelabels, predictedlabels)

      metrics['Avg Accuracy'] = AvgAcc
      metrics['MultiClass Precision'] = AvgPrec
      metrics['MultiClass Recall'] = AvgRec
      metrics['MultiClass FMeasure'] = AvgF
      metrics['MultiClass Lift'] = AvgLift
      metrics['MultiClass MCC'] = AvgMCC
      metrics['MultiClass Information'] = AvgInformation
      metrics['Simple MSE'] = SimpleMSE

    return metrics
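  # For reference, with a three-file dataset the returned dictionary looks
  # roughly like the following (values are illustrative, not measured):
  #
  #   {'Runtime': 0.0091, 'Avg Accuracy': 0.9533,
  #    'MultiClass Precision': 0.9556, ..., 'Simple MSE': 0.0467}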

  '''
  Parse the timer data from a given string.

  @param data - String to parse timer data from.
  @return - Namedtuple that contains the timer data or -1 in case of an error.
  '''
  def parseTimer(self, data):
    # Compile the regular expression pattern into a regular expression object
    # to parse the timer data.
    pattern = re.compile(br"""
        .*?loading_data: (?P<loading_data>.*?)s.*?
        .*?saving_data: (?P<saving_data>.*?)s.*?
        .*?total_time: (?P<total_time>.*?)s.*?
        """, re.VERBOSE|re.MULTILINE|re.DOTALL)

    match = pattern.match(data)
    if not match:
      Log.Fatal("Can't parse the data: wrong format")
      return -1
    else:
      # Create a namedtuple and return the timer data.
      timer = collections.namedtuple('timer',
          ["loading_data", "total_time", "saving_data"])
      return timer(float(match.group("loading_data")),
          float(match.group("total_time")), float(match.group("saving_data")))
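For context, a minimal usage sketch of the DTC class above (illustrative only: the binary paths are hypothetical, the environment variables must be set before the import because the constructor defaults read them at definition time, and a plain import is used for brevity where the tests below use Loader.ImportModuleFromPath):

import os

# Hypothetical locations of the mlpack binaries; the benchmark harness
# normally provides these environment variables.
os.environ.setdefault("MLPACK_BIN", "/usr/local/bin/")
os.environ.setdefault("MLPACK_BIN_DEBUG", "/usr/local/bin/")

from decision_tree import DTC

# A [train, test, labels] triple, matching the config.yaml entries above.
dataset = ["datasets/iris_train.csv", "datasets/iris_test.csv",
    "datasets/iris_labels.csv"]

dtc = DTC(dataset, timeout=9000, verbose=False)
metrics = dtc.RunMetrics("")

# RunMetrics returns a negative value on failure and a dictionary otherwise.
if isinstance(metrics, dict):
  print(metrics["Runtime"], metrics.get("Avg Accuracy"))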
35 changes: 35 additions & 0 deletions tests/benchmark_decision_tree.py
@@ -80,3 +80,38 @@ def test_RunMetrics(self):

'''
Test the mlpack Decision Tree Prediction script.
'''
class DecisionTree_MLPACK_TEST(unittest.TestCase):

  '''
  Test initialization.
  '''
  def setUp(self):
    self.dataset = ['datasets/iris_train.csv', 'datasets/iris_test.csv']
    self.verbose = False
    self.timeout = 9000

    module = Loader.ImportModuleFromPath("methods/mlpack/decision_tree.py")
    obj = getattr(module, "DTC")
    self.instance = obj(self.dataset, verbose=self.verbose,
        timeout=self.timeout)

  '''
  Test the constructor.
  '''
  def test_Constructor(self):
    self.assertEqual(self.instance.verbose, self.verbose)
    self.assertEqual(self.instance.timeout, self.timeout)
    self.assertEqual(self.instance.dataset, self.dataset)

  '''
  Test the 'RunMetrics' function.
  '''
  def test_RunMetrics(self):
    result = self.instance.RunMetrics("")
    self.assertTrue(result["Runtime"] > 0)

if __name__ == '__main__':
  unittest.main()
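A minimal sketch for running only the new test case (an assumption-laden invocation, not part of this PR: it presumes the repository root as the working directory, so the relative dataset paths resolve, and that the tests directory is importable as a package):

import unittest

from tests.benchmark_decision_tree import DecisionTree_MLPACK_TEST

# Build a suite containing only the mlpack decision tree tests and run it.
suite = unittest.TestLoader().loadTestsFromTestCase(DecisionTree_MLPACK_TEST)
unittest.TextTestRunner(verbosity=2).run(suite)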