From 5f090fc2f1c272b839cee8965c77293d018c18d1 Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Mon, 20 Jun 2016 14:30:30 +0100 Subject: [PATCH 01/22] Spark 2.0.0 support This commit makes the default version of Spark "2.0.0-preview" and consists of various configuration file changes and a couple of method changes. We should remove the -preview from the project files once 2.0.0 is made generally available (so we won't be relying on the preview builds). * Akka dependencies are now downloaded for streaming-tests * Scala 2.11.8 is used * config.py now looks for $SPARK_HOME instead of /root * foreachRDD is used instead of foreach for a DStream * awaitTerminationOrTimeout is used instead of awaitTermination for a StreamingContext (both streaming migrations are sketched after the spark-tests diffs below) * json4s render call is removed owing to API changes --- config/config.py.template | 5 ++++- lib/sparkperf/testsuites.py | 2 +- mllib-tests/project/MLlibTestsBuild.scala | 14 +++++++------- pyspark-tests/mllib_tests.py | 8 +++++--- spark-tests/project/SparkTestsBuild.scala | 8 ++++---- .../src/main/scala/spark/perf/TestRunner.scala | 2 +- streaming-tests/project/StreamingTestsBuild.scala | 10 +++++++--- .../scala/streaming/perf/HdfsRecoveryTest.scala | 4 ++-- .../src/main/scala/streaming/perf/KVDataTest.scala | 2 +- 9 files changed, 32 insertions(+), 23 deletions(-) diff --git a/config/config.py.template b/config/config.py.template index a348b6f..bde48d2 100755 --- a/config/config.py.template +++ b/config/config.py.template @@ -18,7 +18,8 @@ from sparkperf.config_utils import FlagSet, JavaOptionSet, OptionSet, ConstantOp # ================================ # # Point to an installation of Spark on the cluster. -SPARK_HOME_DIR = "/root/spark" +DEFAULT_HOME=os.environ['HOME'] +SPARK_HOME_DIR = os.getenv('SPARK_HOME', DEFAULT_HOME) # Use a custom configuration directory SPARK_CONF_DIR = SPARK_HOME_DIR + "/conf" @@ -370,6 +371,8 @@ MLLIB_PERF_TEST_RUNNER = "mllib.perf.TestRunner" # * Build Spark locally by running `build/sbt assembly; build/sbt publishLocal` in the Spark root directory # * Set `USE_CLUSTER_SPARK = True` and `MLLIB_SPARK_VERSION = {desired Spark version, e.g. 
1.5}` # * Don't use PREP_MLLIB_TESTS = True; instead manually run `cd mllib-tests; sbt/sbt -Dspark.version=1.5.0-SNAPSHOT clean assembly` to build perf tests + +# Can be changed to 2.0 to run against Spark 2.0 MLLIB_SPARK_VERSION = 1.5 MLLIB_JAVA_OPTS = COMMON_JAVA_OPTS diff --git a/lib/sparkperf/testsuites.py b/lib/sparkperf/testsuites.py index 573ecf5..46ab349 100644 --- a/lib/sparkperf/testsuites.py +++ b/lib/sparkperf/testsuites.py @@ -252,7 +252,7 @@ class MLlibTests(JVMPerfTestSuite, MLlibTestHelper): @classmethod def build(cls, spark_version): - run_cmd("cd %s/mllib-tests; %s -Dspark.version=%s.0 clean assembly" % (PROJ_DIR, SBT_CMD, spark_version)) + run_cmd("cd %s/mllib-tests; %s -Dspark.version=%s clean assembly" % (PROJ_DIR, SBT_CMD, spark_version)) @classmethod def process_output(cls, config, short_name, opt_list, stdout_filename, stderr_filename): diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala index 3347db4..64a46e9 100644 --- a/mllib-tests/project/MLlibTestsBuild.scala +++ b/mllib-tests/project/MLlibTestsBuild.scala @@ -16,14 +16,15 @@ object MLlibTestsBuild extends Build { lazy val commonSettings = Seq( organization := "org.spark-project", version := "0.1", - scalaVersion := "2.10.4", - sparkVersion := sys.props.getOrElse("spark.version", default="1.5.2"), + scalaVersion := "2.11.8", + sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0-preview"), libraryDependencies ++= Seq( "net.sf.jopt-simple" % "jopt-simple" % "4.6", "org.scalatest" %% "scalatest" % "2.2.1" % "test", "org.slf4j" % "slf4j-log4j12" % "1.7.2", - "org.json4s" %% "json4s-native" % "3.2.9", - "org.apache.spark" %% "spark-mllib" % sparkVersion.value % "provided" + "org.json4s" %% "json4s-native" % "3.2.10", + "org.apache.spark" %% "spark-core" % "2.0.0-preview" % "provided", + "org.apache.spark" %% "spark-streaming" % "2.0.0-preview" % "provided" ) ) @@ -35,9 +36,8 @@ object MLlibTestsBuild extends Build { val targetFolder = sparkVersion.value match { case v if v.startsWith("1.4.") => "v1p4" case v if v.startsWith("1.5.") => "v1p5" - case v if v.startsWith("1.6.") => - "v1p5" // acceptable for now, but change later when new algs are added - case _ => throw new IllegalArgumentException(s"Do not support Spark ${sparkVersion.value}.") + case v if v.startsWith("2.0") => "v2p0" + case _ => throw new IllegalArgumentException(s"This Spark version isn't supported: ${sparkVersion.value}.") } baseDirectory.value / targetFolder / "src" / "main" / "scala" }, diff --git a/pyspark-tests/mllib_tests.py b/pyspark-tests/mllib_tests.py index 1b6a306..133d751 100644 --- a/pyspark-tests/mllib_tests.py +++ b/pyspark-tests/mllib_tests.py @@ -219,8 +219,8 @@ def __init__(self, sc): def createInputData(self): options = self.options - numTrain = options.num_examples - numTest = int(options.num_examples * 0.2) + numTrain = options.num_points + numTest = int(options.num_points * 0.2) self.trainRDD = LabeledDataGenerator.generateGLMData( self.sc, numTrain, options.num_features, options.num_partitions, options.random_seed, labelType=2) @@ -242,7 +242,7 @@ def __init__(self, sc): def createInputData(self): options = self.options self.data = FeaturesGenerator.generateContinuousData( - self.sc, options.num_examples, options.num_features, + self.sc, options.num_points, options.num_columns, options.num_partitions, options.random_seed) def runTest(self): @@ -368,6 +368,8 @@ def runTest(self): parser.add_option("--num-ratings", type="int", default=500) 
parser.add_option("--implicit-prefs", type="int", default=0) # MLLIB_CLUSTERING_TEST_OPTS + parser.add_option("--num-points", type="int", default=1000) + parser.add_option("--num-columns", type="int", default=10) parser.add_option("--num-centers", type="int", default=5) # MLLIB_LINALG_TEST_OPTS + MLLIB_STATS_TEST_OPTS parser.add_option("--num-rows", type="int", default=1000) diff --git a/spark-tests/project/SparkTestsBuild.scala b/spark-tests/project/SparkTestsBuild.scala index 4116326..697b28a 100644 --- a/spark-tests/project/SparkTestsBuild.scala +++ b/spark-tests/project/SparkTestsBuild.scala @@ -10,13 +10,13 @@ object SparkTestsBuild extends Build { settings = assemblySettings ++ Seq( organization := "org.spark-project", version := "0.1", - scalaVersion := "2.10.4", + scalaVersion := "2.11.8", libraryDependencies ++= Seq( "net.sf.jopt-simple" % "jopt-simple" % "4.6", "org.scalatest" %% "scalatest" % "2.2.1" % "test", "com.google.guava" % "guava" % "14.0.1", - "org.apache.spark" %% "spark-core" % "1.0.0" % "provided", - "org.json4s" %% "json4s-native" % "3.2.9" + "org.apache.spark" %% "spark-core" % "2.0.0-preview" % "provided", + "org.json4s" %% "json4s-native" % "3.2.10" ), test in assembly := {}, outputPath in assembly := file("target/spark-perf-tests-assembly.jar"), @@ -36,4 +36,4 @@ object SparkTestsBuild extends Build { case _ => MergeStrategy.first } )) -} \ No newline at end of file +} diff --git a/spark-tests/src/main/scala/spark/perf/TestRunner.scala b/spark-tests/src/main/scala/spark/perf/TestRunner.scala index cbfcb0a..6c21f33 100644 --- a/spark-tests/src/main/scala/spark/perf/TestRunner.scala +++ b/spark-tests/src/main/scala/spark/perf/TestRunner.scala @@ -44,7 +44,7 @@ object TestRunner { ("sparkVersion" -> sc.version) ~ ("systemProperties" -> System.getProperties.asScala.toMap) ~ ("results" -> results) - println("results: " + compact(render(json))) + println("results: " + compact(json)) // Gracefully stop the SparkContext so that the application web UI can be preserved // and viewed using the HistoryServer. 
diff --git a/streaming-tests/project/StreamingTestsBuild.scala b/streaming-tests/project/StreamingTestsBuild.scala index 7c8c903..fc2569c 100644 --- a/streaming-tests/project/StreamingTestsBuild.scala +++ b/streaming-tests/project/StreamingTestsBuild.scala @@ -10,14 +10,18 @@ object StreamingTestsBuild extends Build { settings = assemblySettings ++ Seq( organization := "org.spark-project", version := "0.1", - scalaVersion := "2.10.4", + scalaVersion := "2.11.8", libraryDependencies ++= Seq( "net.sf.jopt-simple" % "jopt-simple" % "4.5", "org.scalatest" %% "scalatest" % "2.2.1" % "test", "com.google.guava" % "guava" % "14.0.1", + "com.typesafe.akka" %% "akka-actor" % "2.3.11", + "com.typesafe.akka" %% "akka-slf4j" % "2.3.11", + "com.typesafe.akka" %% "akka-remote" % "2.3.11", + "com.typesafe.akka" %% "akka-agent" % "2.3.11", "org.slf4j" % "slf4j-log4j12" % "1.7.2", - "org.apache.spark" %% "spark-core" % "1.0.0" % "provided", - "org.apache.spark" %% "spark-streaming" % "1.0.0" % "provided" + "org.apache.spark" %% "spark-core" % "2.0.0-preview" % "provided", + "org.apache.spark" %% "spark-streaming" % "2.0.0-preview" % "provided" ), test in assembly := {}, outputPath in assembly := file("target/streaming-perf-tests-assembly.jar"), diff --git a/streaming-tests/src/main/scala/streaming/perf/HdfsRecoveryTest.scala b/streaming-tests/src/main/scala/streaming/perf/HdfsRecoveryTest.scala index 23e8b80..841ae5e 100644 --- a/streaming-tests/src/main/scala/streaming/perf/HdfsRecoveryTest.scala +++ b/streaming-tests/src/main/scala/streaming/perf/HdfsRecoveryTest.scala @@ -39,7 +39,7 @@ class HdfsRecoveryTest extends PerfTest { // Verify the running counts. For any key the running count should be in the sequence // 1, 3, 6, 10, 15, 21, ... (i.e., nth number is sum of 1..n) val expectedCounts = (1L to maxRecordsPerFile).map(x => (1L to x).reduce(_ + _)).toSet - wordStream.foreach((rdd: RDD[(String, Long)], time: Time) => { + wordStream.foreachRDD((rdd: RDD[(String, Long)], time: Time) => { val partitionCounts = rdd.sparkContext.runJob(rdd.mapPartitions(iter => iter.toSeq.groupBy(_._1).toSeq.map(x => (x._1, x._2.map(_._2).sum)).toIterator ), (iter: Iterator[(String, Long)]) => iter.toArray) @@ -48,7 +48,7 @@ class HdfsRecoveryTest extends PerfTest { val counts = rdd.reduceByKey(_ + _, 1).collect() println(s"New total count at $time = " + counts.mkString("[", ", ", "]")) }) - runningCountStream.foreach((rdd: RDD[(String, Long)], time: Time) => { + runningCountStream.foreachRDD((rdd: RDD[(String, Long)], time: Time) => { val counts = rdd.collect() val possibleCounts = expectedCounts val expected = counts.forall { case (word, count) => possibleCounts.contains(count) } diff --git a/streaming-tests/src/main/scala/streaming/perf/KVDataTest.scala b/streaming-tests/src/main/scala/streaming/perf/KVDataTest.scala index 628376d..9aeb989 100644 --- a/streaming-tests/src/main/scala/streaming/perf/KVDataTest.scala +++ b/streaming-tests/src/main/scala/streaming/perf/KVDataTest.scala @@ -70,7 +70,7 @@ abstract class KVDataTest extends PerfTest { // run test ssc.start() val startTime = System.currentTimeMillis - ssc.awaitTermination(totalDurationSec * 1000) + ssc.awaitTerminationOrTimeout(totalDurationSec * 1000) ssc.stop() processResults(statsReportListener) } From a6c3403c15cc0db35d80186938edb73749490abc Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Mon, 20 Jun 2016 18:21:24 +0100 Subject: [PATCH 02/22] Fix dependency for mllib --- mllib-tests/project/MLlibTestsBuild.scala | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala index 64a46e9..6799a93 100644 --- a/mllib-tests/project/MLlibTestsBuild.scala +++ b/mllib-tests/project/MLlibTestsBuild.scala @@ -24,7 +24,7 @@ object MLlibTestsBuild extends Build { "org.slf4j" % "slf4j-log4j12" % "1.7.2", "org.json4s" %% "json4s-native" % "3.2.10", "org.apache.spark" %% "spark-core" % "2.0.0-preview" % "provided", - "org.apache.spark" %% "spark-streaming" % "2.0.0-preview" % "provided" + "org.apache.spark" %% "spark-mllib" % "2.0.0-preview" % "provided" ) ) From 27e640f3e11ed5d2b978f6b781f1db09ed04c99e Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Mon, 20 Jun 2016 18:24:01 +0100 Subject: [PATCH 03/22] Add 1.6 target folder back in --- mllib-tests/project/MLlibTestsBuild.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala index 6799a93..cd392d8 100644 --- a/mllib-tests/project/MLlibTestsBuild.scala +++ b/mllib-tests/project/MLlibTestsBuild.scala @@ -36,6 +36,8 @@ object MLlibTestsBuild extends Build { val targetFolder = sparkVersion.value match { case v if v.startsWith("1.4.") => "v1p4" case v if v.startsWith("1.5.") => "v1p5" + case v if v.startsWith("1.6.") => + "v1p5" // acceptable for now, but change later when new algs are added case v if v.startsWith("2.0") => "v2p0" case _ => throw new IllegalArgumentException(s"This Spark version isn't supported: ${sparkVersion.value}.") } From 54735062c85455e4925da883e22c33d72158f0ad Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Thu, 30 Jun 2016 16:47:21 +0100 Subject: [PATCH 04/22] Add ML changes for Spark 2; note this currently requires Spark in the lib folder! --- config/config.py.template | 801 ------------------ mllib-tests/project/MLlibTestsBuild.scala | 20 +- .../scala/mllib/perf/MLAlgorithmTests.scala | 21 +- .../perf/clustering/GaussianMixtureTest.scala | 14 +- .../scala/mllib/perf/clustering/PICTest.scala | 13 +- .../scala/mllib/perf/util/DataGenerator.scala | 8 +- .../scala/mllib/perf/MLAlgorithmTests.scala | 37 +- .../perf/clustering/GaussianMixtureTest.scala | 14 +- .../scala/mllib/perf/clustering/PICTest.scala | 13 +- .../scala/mllib/perf/util/DataGenerator.scala | 8 +- .../scala/mllib/perf/LinearAlgebraTests.scala | 68 ++ .../scala/mllib/perf/MLAlgorithmTests.scala | 779 +++++++++++++++++ .../src/main/scala/mllib/perf/PerfTest.scala | 134 +++ .../src/main/scala/mllib/perf/StatTests.scala | 109 +++ .../main/scala/mllib/perf/TestRunner.scala | 87 ++ .../perf/clustering/GaussianMixtureTest.scala | 63 ++ .../scala/mllib/perf/clustering/LDATest.scala | 73 ++ .../scala/mllib/perf/clustering/PICTest.scala | 53 ++ .../mllib/perf/feature/Word2VecTest.scala | 69 ++ .../scala/mllib/perf/fpm/FPGrowthTest.scala | 65 ++ .../scala/mllib/perf/fpm/PrefixSpanTest.scala | 82 ++ .../perf/linalg/BlockMatrixMultTest.scala | 74 ++ .../scala/mllib/perf/util/DataGenerator.scala | 586 +++++++++++++ .../scala/mllib/perf/util/DataLoader.scala | 143 ++++ 24 files changed, 2457 insertions(+), 877 deletions(-) delete mode 100755 config/config.py.template create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/LinearAlgebraTests.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/MLAlgorithmTests.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/PerfTest.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/StatTests.scala create mode 100644 
mllib-tests/v2p0/src/main/scala/mllib/perf/TestRunner.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/LDATest.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/PICTest.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/feature/Word2VecTest.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/fpm/FPGrowthTest.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/fpm/PrefixSpanTest.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/linalg/BlockMatrixMultTest.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataGenerator.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataLoader.scala diff --git a/config/config.py.template b/config/config.py.template deleted file mode 100755 index bde48d2..0000000 --- a/config/config.py.template +++ /dev/null @@ -1,801 +0,0 @@ -""" -Configuration options for running Spark performance tests. - -When updating `spark-perf`, you should probably use `diff` to compare the updated template to -your modified `config.py` file and copy over any new configurations. -""" - -import time -import os -import os.path -import socket - -from sparkperf.config_utils import FlagSet, JavaOptionSet, OptionSet, ConstantOption - - -# ================================ # -# Standard Configuration Options # -# ================================ # - -# Point to an installation of Spark on the cluster. -DEFAULT_HOME=os.environ['HOME'] -SPARK_HOME_DIR = os.getenv('SPARK_HOME', DEFAULT_HOME) - -# Use a custom configuration directory -SPARK_CONF_DIR = SPARK_HOME_DIR + "/conf" - -# Master used when submitting Spark jobs. -# For local clusters: "spark://%s:7077" % socket.gethostname() -# For Yarn clusters: "yarn" -# Otherwise, the default uses the specified EC2 cluster -SPARK_CLUSTER_URL = open("/root/spark-ec2/cluster-url", 'r').readline().strip() -IS_YARN_MODE = "yarn" in SPARK_CLUSTER_URL -IS_MESOS_MODE = "mesos" in SPARK_CLUSTER_URL - -# Specify URI to download spark executor. This only applied for running with Mesos. -#SPARK_EXECUTOR_URI = "http://localhost:8000/spark.tgz" - -# Path to the Mesos native library. This is only required for running with Mesos. -#MESOS_NATIVE_LIBRARY = "/usr/local/lib/libmesos.so" - -# Run Mesos client in coarse or fine grain mode. This is only applied for running with Mesos. -#SPARK_MESOS_COARSE = True - - -# If this is true, we'll submit your job using an existing Spark installation. -# If this is false, we'll clone and build a specific version of Spark, and -# copy configurations from your existing Spark installation. -USE_CLUSTER_SPARK = True - -# URL of the HDFS installation in the Spark EC2 cluster -HDFS_URL = "hdfs://%s:9000/test/" % socket.gethostname() - -# Set the following if not using existing Spark installation -# Commit id and repo used if you are not using an existing Spark cluster -# custom version of Spark. The remote name in your git repo is assumed -# to be "origin". -# -# The commit ID can specify any of the following: -# 1. A git commit hash e.g. "4af93ff3" -# 2. A branch name e.g. "origin/branch-0.7" -# 3. A tag name e.g. "origin/tag/v0.8.0-incubating" -# 4. A pull request e.g. 
"origin/pr/675" -SPARK_COMMIT_ID = "" -SPARK_GIT_REPO = "https://github.com/apache/spark.git" -SPARK_MERGE_COMMIT_INTO_MASTER = False # Whether to merge the commit into master - -# Whether to install and build Spark. Set this to true only for the -# first installation if an existing one does not already exist. -PREP_SPARK = not USE_CLUSTER_SPARK - -# Whether to restart the Master and all Workers -# This should always be false for Yarn -RESTART_SPARK_CLUSTER = True -RESTART_SPARK_CLUSTER = RESTART_SPARK_CLUSTER and not IS_YARN_MODE - -# Rsync SPARK_HOME to all the slaves or not -RSYNC_SPARK_HOME = True - -# Which tests to run -RUN_SPARK_TESTS = True -RUN_PYSPARK_TESTS = False -RUN_STREAMING_TESTS = False -RUN_MLLIB_TESTS = False -RUN_PYTHON_MLLIB_TESTS = False - -# Which tests to prepare. Set this to true for the first -# installation or whenever you make a change to the tests. -PREP_SPARK_TESTS = True -PREP_PYSPARK_TESTS = False -PREP_STREAMING_TESTS = False -PREP_MLLIB_TESTS = False - -# Whether to warm up local disks (warm-up is only necesary on EC2). -DISK_WARMUP = False - -# Total number of bytes used to warm up each local directory. -DISK_WARMUP_BYTES = 200 * 1024 * 1024 - -# Number of files to create when warming up each local directory. -# Bytes will be evenly divided across files. -DISK_WARMUP_FILES = 200 - -# Prompt for confirmation when deleting temporary files. -PROMPT_FOR_DELETES = True - -# Files to write results to -SPARK_OUTPUT_FILENAME = "results/spark_perf_output_%s_%s" % ( - SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) -PYSPARK_OUTPUT_FILENAME = "results/python_perf_output_%s_%s" % ( - SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) -STREAMING_OUTPUT_FILENAME = "results/streaming_perf_output_%s_%s" % ( - SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) -MLLIB_OUTPUT_FILENAME = "results/mllib_perf_output_%s_%s" % ( - SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) -PYTHON_MLLIB_OUTPUT_FILENAME = "results/python_mllib_perf_output_%s_%s" % ( - SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) - - -# ============================ # -# Test Configuration Options # -# ============================ # - -# The default values configured below are appropriate for approximately 20 m1.xlarge nodes, -# in which each node has 15 GB of memory. Use this variable to scale the values (e.g. -# number of records in a generated dataset) if you are running the tests with more -# or fewer nodes. When developing new test suites, you might want to set this to a small -# value suitable for a single machine, such as 0.001. -SCALE_FACTOR = 1.0 - -assert SCALE_FACTOR > 0, "SCALE_FACTOR must be > 0." - -# If set, removes the first N trials for each test from all reported statistics. Useful for -# tests which have outlier behavior due to JIT and other system cache warm-ups. If any test -# returns fewer N + 1 results, an exception is thrown. -IGNORED_TRIALS = 2 - -# Command used to launch Scala or Java. - -# Set up OptionSets. Note that giant cross product is done over all JavaOptionsSets + OptionSets -# passed to each test which may be combinations of those set up here. - -# Java options. -COMMON_JAVA_OPTS = [ - # Fraction of JVM memory used for caching RDDs. 
- JavaOptionSet("spark.storage.memoryFraction", [0.66]), - JavaOptionSet("spark.serializer", ["org.apache.spark.serializer.JavaSerializer"]), - # JavaOptionSet("spark.executor.memory", ["9g"]), - # Turn event logging on in order better diagnose failed tests. Off by default as it crashes - # releases prior to 1.0.2 - # JavaOptionSet("spark.eventLog.enabled", [True]), - # To ensure consistency across runs, we disable delay scheduling - JavaOptionSet("spark.locality.wait", [str(60 * 1000 * 1000)]) -] -# Set driver memory here -SPARK_DRIVER_MEMORY = "20g" -# The following options value sets are shared among all tests. -COMMON_OPTS = [ - # How many times to run each experiment - used to warm up system caches. - # This OptionSet should probably only have a single value (i.e., length 1) - # since it doesn't make sense to have multiple values here. - OptionSet("num-trials", [10]), - # Extra pause added between trials, in seconds. For runs with large amounts - # of shuffle data, this gives time for buffer cache write-back. - OptionSet("inter-trial-wait", [3]) -] - -# The following options value sets are shared among all tests of -# operations on key-value data. -SPARK_KEY_VAL_TEST_OPTS = [ - # The number of input partitions. - OptionSet("num-partitions", [400], can_scale=True), - # The number of reduce tasks. - OptionSet("reduce-tasks", [400], can_scale=True), - # A random seed to make tests reproducable. - OptionSet("random-seed", [5]), - # Input persistence strategy (can be "memory", "disk", or "hdfs"). - # NOTE: If "hdfs" is selected, datasets will be re-used across runs of - # this script. This means parameters here are effectively ignored if - # an existing input dataset is present. - OptionSet("persistent-type", ["memory"]), - # Whether to wait for input in order to exit the JVM. - FlagSet("wait-for-exit", [False]), - # Total number of records to create. - OptionSet("num-records", [200 * 1000 * 1000], True), - # Number of unique keys to sample from. - OptionSet("unique-keys",[20 * 1000], True), - # Length in characters of each key. - OptionSet("key-length", [10]), - # Number of unique values to sample from. - OptionSet("unique-values", [1000 * 1000], True), - # Length in characters of each value. - OptionSet("value-length", [10]), - # Use hashes instead of padded numbers for keys and values - FlagSet("hash-records", [False]), - # Storage location if HDFS persistence is used - OptionSet("storage-location", [ - HDFS_URL + "/spark-perf-kv-data"]) -] - - -# ======================= # -# Spark Core Test Setup # -# ======================= # - -# Set up the actual tests. Each test is represtented by a tuple: -# (short_name, test_cmd, scale_factor, list, list) - -SPARK_KV_OPTS = COMMON_OPTS + SPARK_KEY_VAL_TEST_OPTS -SPARK_TESTS = [] - -SCHEDULING_THROUGHPUT_OPTS = [ - # The number of tasks that should be launched in each job: - OptionSet("num-tasks", [10 * 1000]), - # The number of jobs that should be run: - OptionSet("num-jobs", [1]), - # The size of the task closure (in bytes): - OptionSet("closure-size", [0]), - # A random seed to make tests reproducible: - OptionSet("random-seed", [5]), -] - -SPARK_TESTS += [("scheduling-throughput", "spark.perf.TestRunner", - SCALE_FACTOR, COMMON_JAVA_OPTS, - [ConstantOption("scheduling-throughput")] + COMMON_OPTS + SCHEDULING_THROUGHPUT_OPTS)] - -SPARK_TESTS += [("scala-agg-by-key", "spark.perf.TestRunner", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("aggregate-by-key")] + SPARK_KV_OPTS)] - -# Scale the input for this test by 2x since ints are smaller. 
-SPARK_TESTS += [("scala-agg-by-key-int", "spark.perf.TestRunner", SCALE_FACTOR * 2, - COMMON_JAVA_OPTS, [ConstantOption("aggregate-by-key-int")] + SPARK_KV_OPTS)] - -SPARK_TESTS += [("scala-agg-by-key-naive", "spark.perf.TestRunner", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("aggregate-by-key-naive")] + SPARK_KV_OPTS)] - -# Scale the input for this test by 0.10. -SPARK_TESTS += [("scala-sort-by-key", "spark.perf.TestRunner", SCALE_FACTOR * 0.1, - COMMON_JAVA_OPTS, [ConstantOption("sort-by-key")] + SPARK_KV_OPTS)] - -SPARK_TESTS += [("scala-sort-by-key-int", "spark.perf.TestRunner", SCALE_FACTOR * 0.2, - COMMON_JAVA_OPTS, [ConstantOption("sort-by-key-int")] + SPARK_KV_OPTS)] - -SPARK_TESTS += [("scala-count", "spark.perf.TestRunner", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("count")] + SPARK_KV_OPTS)] - -SPARK_TESTS += [("scala-count-w-fltr", "spark.perf.TestRunner", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("count-with-filter")] + SPARK_KV_OPTS)] - - -# ==================== # -# Pyspark Test Setup # -# ==================== # - -PYSPARK_TESTS = [] - -BROADCAST_TEST_OPTS = [ - # The size of broadcast - OptionSet("broadcast-size", [200 << 20], can_scale=True), -] - -PYSPARK_TESTS += [("python-scheduling-throughput", "core_tests.py", - SCALE_FACTOR, COMMON_JAVA_OPTS, - [ConstantOption("SchedulerThroughputTest"), OptionSet("num-tasks", [5000])] + COMMON_OPTS)] - -PYSPARK_TESTS += [("python-agg-by-key", "core_tests.py", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("AggregateByKey")] + SPARK_KV_OPTS)] - -# Scale the input for this test by 2x since ints are smaller. -PYSPARK_TESTS += [("python-agg-by-key-int", "core_tests.py", SCALE_FACTOR * 2, - COMMON_JAVA_OPTS, [ConstantOption("AggregateByKeyInt")] + SPARK_KV_OPTS)] - -PYSPARK_TESTS += [("python-agg-by-key-naive", "core_tests.py", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("AggregateByKeyNaive")] + SPARK_KV_OPTS)] - -# Scale the input for this test by 0.10. 
-PYSPARK_TESTS += [("python-sort-by-key", "core_tests.py", SCALE_FACTOR * 0.1, - COMMON_JAVA_OPTS, [ConstantOption("SortByKey")] + SPARK_KV_OPTS)] - -PYSPARK_TESTS += [("python-sort-by-key-int", "core_tests.py", SCALE_FACTOR * 0.2, - COMMON_JAVA_OPTS, [ConstantOption("SortByKeyInt")] + SPARK_KV_OPTS)] - -PYSPARK_TESTS += [("python-count", "core_tests.py", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("Count")] + SPARK_KV_OPTS)] - -PYSPARK_TESTS += [("python-count-w-fltr", "core_tests.py", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("CountWithFilter")] + SPARK_KV_OPTS)] - -PYSPARK_TESTS += [("python-broadcast-w-bytes", "core_tests.py", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("BroadcastWithBytes")] + SPARK_KV_OPTS + BROADCAST_TEST_OPTS)] - -PYSPARK_TESTS += [("python-broadcast-w-set", "core_tests.py", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("BroadcastWithSet")] + SPARK_KV_OPTS + BROADCAST_TEST_OPTS)] - - -# ============================ # -# Spark Streaming Test Setup # -# ============================ # - -STREAMING_TESTS = [] - -# The following function generates options for setting batch duration in streaming tests -def streaming_batch_duration_opts(duration): - return [OptionSet("batch-duration", [duration])] - -# The following function generates options for setting window duration in streaming tests -def streaming_window_duration_opts(duration): - return [OptionSet("window-duration", [duration])] - -STREAMING_COMMON_OPTS = [ - OptionSet("total-duration", [60]), - OptionSet("hdfs-url", [HDFS_URL]), -] - -STREAMING_COMMON_JAVA_OPTS = [ - # Fraction of JVM memory used for caching RDDs. - JavaOptionSet("spark.storage.memoryFraction", [0.66]), - JavaOptionSet("spark.serializer", ["org.apache.spark.serializer.JavaSerializer"]), - # JavaOptionSet("spark.executor.memory", ["9g"]), - JavaOptionSet("spark.executor.extraJavaOptions", [" -XX:+UseConcMarkSweepGC "]) -] - -STREAMING_KEY_VAL_TEST_OPTS = STREAMING_COMMON_OPTS + streaming_batch_duration_opts(2000) + [ - # Number of input streams. - OptionSet("num-streams", [1], can_scale=True), - # Number of records per second per input stream - OptionSet("records-per-sec", [10 * 1000]), - # Number of reduce tasks. - OptionSet("reduce-tasks", [10], can_scale=True), - # memory serialization ("true" or "false"). - OptionSet("memory-serialization", ["true"]), - # Number of unique keys to sample from. - OptionSet("unique-keys",[100 * 1000], can_scale=True), - # Length in characters of each key. 
- OptionSet("unique-values", [1000 * 1000], can_scale=True), - # Send data through receiver - OptionSet("use-receiver", ["true"]), -] - -STREAMING_HDFS_RECOVERY_TEST_OPTS = STREAMING_COMMON_OPTS + streaming_batch_duration_opts(5000) + [ - OptionSet("records-per-file", [10000]), - OptionSet("file-cleaner-delay", [300]) -] - -# This test is just to see if everything is setup properly -STREAMING_TESTS += [("basic", "streaming.perf.TestRunner", SCALE_FACTOR, - STREAMING_COMMON_JAVA_OPTS, [ConstantOption("basic")] + STREAMING_COMMON_OPTS + streaming_batch_duration_opts(1000))] - -STREAMING_TESTS += [("state-by-key", "streaming.perf.TestRunner", SCALE_FACTOR, - STREAMING_COMMON_JAVA_OPTS, [ConstantOption("state-by-key")] + STREAMING_KEY_VAL_TEST_OPTS)] - -STREAMING_TESTS += [("group-by-key-and-window", "streaming.perf.TestRunner", SCALE_FACTOR, - STREAMING_COMMON_JAVA_OPTS, [ConstantOption("group-by-key-and-window")] + STREAMING_KEY_VAL_TEST_OPTS + streaming_window_duration_opts(10000) )] - -STREAMING_TESTS += [("reduce-by-key-and-window", "streaming.perf.TestRunner", SCALE_FACTOR, - STREAMING_COMMON_JAVA_OPTS, [ConstantOption("reduce-by-key-and-window")] + STREAMING_KEY_VAL_TEST_OPTS + streaming_window_duration_opts(10000) )] - -STREAMING_TESTS += [("hdfs-recovery", "streaming.perf.TestRunner", SCALE_FACTOR, - STREAMING_COMMON_JAVA_OPTS, [ConstantOption("hdfs-recovery")] + STREAMING_HDFS_RECOVERY_TEST_OPTS)] - - -# ================== # -# MLlib Test Setup # -# ================== # - -MLLIB_TESTS = [] -MLLIB_PERF_TEST_RUNNER = "mllib.perf.TestRunner" - -# Set this to 1.0, 1.1, 1.2, ... (the major version) to test MLlib with a particular Spark version. -# Note: You should also build mllib-perf using -Dspark.version to specify the same version. -# Note: To run perf tests against a snapshot version of Spark which has not yet been packaged into a release: -# * Build Spark locally by running `build/sbt assembly; build/sbt publishLocal` in the Spark root directory -# * Set `USE_CLUSTER_SPARK = True` and `MLLIB_SPARK_VERSION = {desired Spark version, e.g. 1.5}` -# * Don't use PREP_MLLIB_TESTS = True; instead manually run `cd mllib-tests; sbt/sbt -Dspark.version=1.5.0-SNAPSHOT clean assembly` to build perf tests - -# Can be changed to 2.0 for using Spark 2.0 -MLLIB_SPARK_VERSION = 1.5 - -MLLIB_JAVA_OPTS = COMMON_JAVA_OPTS -if MLLIB_SPARK_VERSION >= 1.1: - MLLIB_JAVA_OPTS = MLLIB_JAVA_OPTS + [ - # Shuffle manager: SORT, HASH - JavaOptionSet("spark.shuffle.manager", ["SORT"]) - ] - -# The following options value sets are shared among all tests of -# operations on MLlib algorithms. -MLLIB_COMMON_OPTS = COMMON_OPTS + [ - # The number of input partitions. - # The default setting is suitable for a 16-node m3.2xlarge EC2 cluster. - OptionSet("num-partitions", [128], can_scale=True), - # A random seed to make tests reproducable. - OptionSet("random-seed", [5]) -] - -# Algorithms available in Spark-1.0 # - -# Regression and Classification Tests # -MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS = MLLIB_COMMON_OPTS + [ - # The number of rows or examples - OptionSet("num-examples", [1000000], can_scale=True) -] - -# Generalized Linear Model (GLM) Tests # -MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ - # The scale factor for the noise in feature values. - # Currently ignored for regression. 
- OptionSet("feature-noise", [1.0]), - # The number of features per example - OptionSet("num-features", [10000], can_scale=False), - # The number of iterations for SGD - OptionSet("num-iterations", [20]), - # The step size for SGD - OptionSet("step-size", [0.001]), - # Regularization type: none, l1, l2 - OptionSet("reg-type", ["l2"]), - # Regularization parameter - OptionSet("reg-param", [0.1]) -] -if MLLIB_SPARK_VERSION >= 1.5: - MLLIB_GLM_TEST_OPTS += [ - # Ignored, but required for config - OptionSet("elastic-net-param", [0.0]) - ] - -# GLM Regression Tests # -MLLIB_GLM_REGRESSION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [ - # Optimization algorithm: sgd - OptionSet("optimizer", ["sgd"]), - # The intercept for the data - OptionSet("intercept", [0.0]), - # The scale factor for label noise - OptionSet("label-noise", [0.1]), - # Loss to minimize: l2 (squared error) - OptionSet("loss", ["l2"]) -] - -MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] + MLLIB_GLM_REGRESSION_TEST_OPTS)] - -# Classification Tests # -MLLIB_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [ - # Expected fraction of examples which are negative - OptionSet("per-negative", [0.3]), - # Optimization algorithm: sgd, l-bfgs - OptionSet("optimizer", ["sgd", "l-bfgs"]) -] - -# GLM Classification Tests # -MLLIB_GLM_CLASSIFICATION_TEST_OPTS = MLLIB_CLASSIFICATION_TEST_OPTS + [ - # Loss to minimize: logistic, hinge (SVM) - OptionSet("loss", ["logistic"]) -] - -MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] + - MLLIB_GLM_CLASSIFICATION_TEST_OPTS)] - -if MLLIB_SPARK_VERSION >= 1.5: - MLLIB_GLM_ELASTIC_NET_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ - # The max number of iterations for LBFGS/OWLQN - OptionSet("num-iterations", [20]), - # LBFGS/OWLQN is used with elastic-net regularization. - OptionSet("optimizer", ["auto"]), - # Using elastic-net regularization. - OptionSet("reg-type", ["elastic-net"]), - # Runs with L2 (param = 0.0), L1 (param = 1.0). - OptionSet("elastic-net-param", [0.0, 1.0]), - # Regularization param (lambda) - OptionSet("reg-param", [0.01]), - # The scale factor for the noise in feature values - OptionSet("feature-noise", [1.0]), - # The step size is not used in LBFGS, but this is required in parameter checking. - OptionSet("step-size", [0.0]) - ] - - MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [ - # The scale factor for the noise in label values - OptionSet("label-noise", [0.1]), - # The intercept for the data - OptionSet("intercept", [0.2]), - # Loss to minimize: l2 (squared error) - OptionSet("loss", ["l2"]) - ] - - # Test L-BFGS - MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] + - MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS + - [OptionSet("num-features", [10000], can_scale=False)])] - # Test normal equation solver - MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] + - MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS + - [OptionSet("num-features", [200], can_scale=False)])] - - MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [ - # Expected fraction of examples which are negative - OptionSet("per-negative", [0.3]), - # In GLM classification with elastic-net regularization, only logistic loss is supported. 
- OptionSet("loss", ["logistic"]) - ] - - # Test L-BFGS - MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] + - MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS + - [OptionSet("num-features", [10000], can_scale=False)])] - # Test normal equation solver - MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] + - MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS + - [OptionSet("num-features", [200], can_scale=False)])] - -NAIVE_BAYES_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ - # The number of features per example - OptionSet("num-features", [10000], can_scale=False), - # Expected fraction of examples which are negative - OptionSet("per-negative", [0.3]), - # The scale factor for the noise in feature values - OptionSet("feature-noise", [1.0]), - # Naive Bayes smoothing lambda. - OptionSet("nb-lambda", [1.0]), - # Model type: either multinomial or bernoulli (bernoulli only available in Spark 1.4+) - OptionSet("model-type", ["multinomial"]), -] - -MLLIB_TESTS += [("naive-bayes", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("naive-bayes")] + - NAIVE_BAYES_TEST_OPTS)] - -# Decision Trees # -MLLIB_DECISION_TREE_TEST_OPTS = MLLIB_COMMON_OPTS + [ - # The number of rows or examples - OptionSet("num-examples", [1000000], can_scale=True), - # The number of features per example - OptionSet("num-features", [500], can_scale=False), - # Type of label: 0 indicates regression, 2+ indicates classification with this many classes - # Note: multi-class (>2) is not supported in Spark 1.0. - OptionSet("label-type", [0, 2], can_scale=False), - # Fraction of features which are categorical - OptionSet("frac-categorical-features", [0.5], can_scale=False), - # Fraction of categorical features which are binary. Others have 20 categories. - OptionSet("frac-binary-features", [0.5], can_scale=False), - # Depth of true decision tree model used to label examples. - # WARNING: The meaning of depth changed from Spark 1.0 to Spark 1.1: - # depth=N for Spark 1.0 should be depth=N-1 for Spark 1.1 - OptionSet("tree-depth", [5, 10], can_scale=False), - # Maximum number of bins for the decision tree learning algorithm. - OptionSet("max-bins", [32], can_scale=False), -] - -if MLLIB_SPARK_VERSION >= 1.2: - ensembleTypes = ["RandomForest"] - if MLLIB_SPARK_VERSION >= 1.3: - ensembleTypes.append("GradientBoostedTrees") - if MLLIB_SPARK_VERSION >= 1.4: - ensembleTypes.extend(["ml.RandomForest", "ml.GradientBoostedTrees"]) - MLLIB_DECISION_TREE_TEST_OPTS += [ - # Ensemble type: mllib.RandomForest, mllib.GradientBoostedTrees, - # ml.RandomForest, ml.GradientBoostedTrees - OptionSet("ensemble-type", ensembleTypes), - # Path to training dataset (if not given, use random data). - OptionSet("training-data", [""]), - # Path to test dataset (only used if training dataset given). - # If not given, hold out part of training data for validation. - OptionSet("test-data", [""]), - # Fraction of data to hold out for testing - # (Ignored if given training and test dataset, or if using synthetic data.) - OptionSet("test-data-fraction", [0.2], can_scale=False), - # Number of trees. If 1, then run DecisionTree. If >1, then run RandomForest. 
- OptionSet("num-trees", [1, 10], can_scale=False), - # Feature subset sampling strategy: auto, all, sqrt, log2, onethird - # (only used for RandomForest) - OptionSet("feature-subset-strategy", ["auto"]) - ] - -MLLIB_TESTS += [("decision-tree", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("decision-tree")] + - MLLIB_DECISION_TREE_TEST_OPTS)] - -# Recommendation Tests # -MLLIB_RECOMMENDATION_TEST_OPTS = MLLIB_COMMON_OPTS + [ - # The number of users - OptionSet("num-users", [6000000], can_scale=True), - # The number of products - OptionSet("num-products", [5000000], can_scale=False), - # The number of ratings - OptionSet("num-ratings", [50000000], can_scale=True), - # The number of iterations for ALS - OptionSet("num-iterations", [10]), - # The rank of the factorized matrix model - OptionSet("rank", [10]), - # The regularization parameter - OptionSet("reg-param", [0.1]), - # Whether to use implicit preferences or not - FlagSet("implicit-prefs", [False]) -] - -MLLIB_TESTS += [("als", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("als")] + - MLLIB_RECOMMENDATION_TEST_OPTS)] - -# Clustering Tests # -MLLIB_CLUSTERING_TEST_OPTS = MLLIB_COMMON_OPTS + [ - # The number of examples - OptionSet("num-examples", [1000000], can_scale=True), - # The number of features per point - OptionSet("num-features", [10000], can_scale=False), - # The number of centers - OptionSet("num-centers", [20]), - # The number of iterations for KMeans - OptionSet("num-iterations", [20]) -] - -MLLIB_TESTS += [("kmeans", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("kmeans")] + MLLIB_CLUSTERING_TEST_OPTS)] - -MLLIB_GMM_TEST_OPTS = MLLIB_COMMON_OPTS + [ - OptionSet("num-examples", [1000000], can_scale=True), - OptionSet("num-features", [100], can_scale=False), - OptionSet("num-centers", [20], can_scale=False), - OptionSet("num-iterations", [20])] - -if MLLIB_SPARK_VERSION >= 1.3: - MLLIB_TESTS += [("gmm", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("gmm")] + MLLIB_GMM_TEST_OPTS)] - -MLLIB_LDA_TEST_OPTS = MLLIB_COMMON_OPTS + [ - OptionSet("num-documents", [50000], can_scale=True), - OptionSet("num-vocab", [10000], can_scale=False), - OptionSet("num-topics", [20], can_scale=False), - OptionSet("num-iterations", [20]), - OptionSet("document-length", [100]), - OptionSet("optimizer", ["em", "online"])] - -if MLLIB_SPARK_VERSION >= 1.4: - MLLIB_TESTS += [("lda", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("lda")] + MLLIB_LDA_TEST_OPTS)] - -MLLIB_PIC_TEST_OPTS = MLLIB_COMMON_OPTS + [ - OptionSet("num-examples", [10000000], can_scale=True), - OptionSet("node-degree", [20], can_scale=False), - OptionSet("num-centers", [40], can_scale=False), - OptionSet("num-iterations", [20])] - -if MLLIB_SPARK_VERSION >= 1.3: - MLLIB_TESTS += [("pic", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("pic")] + MLLIB_PIC_TEST_OPTS)] - -# Linear Algebra Tests # -MLLIB_LINALG_TEST_OPTS = MLLIB_COMMON_OPTS + [ - # The number of rows for the matrix - OptionSet("num-rows", [1000000], can_scale=True), - # The number of columns for the matrix - OptionSet("num-cols", [1000], can_scale=False), - # The number of top singular values wanted for SVD and PCA - OptionSet("rank", [50], can_scale=False) -] -# Linear Algebra Tests which take more time (slightly smaller settings) # -MLLIB_BIG_LINALG_TEST_OPTS = MLLIB_COMMON_OPTS + [ - # The number of rows for the matrix - OptionSet("num-rows", [1000000], 
can_scale=True), - # The number of columns for the matrix - OptionSet("num-cols", [500], can_scale=False), - # The number of top singular values wanted for SVD and PCA - OptionSet("rank", [10], can_scale=False) -] - -MLLIB_TESTS += [("svd", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("svd")] + MLLIB_BIG_LINALG_TEST_OPTS)] - -MLLIB_TESTS += [("pca", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("pca")] + MLLIB_LINALG_TEST_OPTS)] - -MLLIB_TESTS += [("summary-statistics", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("summary-statistics")] + - MLLIB_BIG_LINALG_TEST_OPTS)] - -MLLIB_BLOCK_MATRIX_MULT_TEST_OPTS = MLLIB_COMMON_OPTS + [ - OptionSet("m", [20000], can_scale=True), - OptionSet("k", [10000], can_scale=False), - OptionSet("n", [10000], can_scale=False), - OptionSet("block-size", [1024], can_scale=False)] - -if MLLIB_SPARK_VERSION >= 1.3: - MLLIB_TESTS += [("block-matrix-mult", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("block-matrix-mult")] + MLLIB_BLOCK_MATRIX_MULT_TEST_OPTS)] - -# Statistic Toolkit Tests # -MLLIB_STATS_TEST_OPTS = MLLIB_COMMON_OPTS - -MLLIB_PEARSON_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ - [OptionSet("num-rows", [1000000], can_scale=True), - OptionSet("num-cols", [1000], can_scale=False)] - -MLLIB_SPEARMAN_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ - [OptionSet("num-rows", [1000000], can_scale=True), - OptionSet("num-cols", [100], can_scale=False)] - -MLLIB_CHI_SQ_FEATURE_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ - [OptionSet("num-rows", [2000000], can_scale=True), - OptionSet("num-cols", [500], can_scale=False)] - -MLLIB_CHI_SQ_GOF_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ - [OptionSet("num-rows", [50000000], can_scale=True), - OptionSet("num-cols", [0], can_scale=False)] - -MLLIB_CHI_SQ_MAT_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ - [OptionSet("num-rows", [20000], can_scale=True), - OptionSet("num-cols", [0], can_scale=False)] - -if MLLIB_SPARK_VERSION >= 1.1: - MLLIB_TESTS += [("pearson", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("pearson")] + MLLIB_PEARSON_TEST_OPTS)] - - MLLIB_TESTS += [("spearman", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("spearman")] + MLLIB_SPEARMAN_TEST_OPTS)] - - MLLIB_TESTS += [("chi-sq-feature", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("chi-sq-feature")] + MLLIB_CHI_SQ_FEATURE_TEST_OPTS)] - - MLLIB_TESTS += [("chi-sq-gof", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("chi-sq-gof")] + MLLIB_CHI_SQ_GOF_TEST_OPTS)] - - MLLIB_TESTS += [("chi-sq-mat", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("chi-sq-mat")] + MLLIB_CHI_SQ_MAT_TEST_OPTS)] - -# Feature Transformation Tests # - -MLLIB_FEATURE_TEST_OPTS = MLLIB_COMMON_OPTS - -MLLIB_WORD2VEC_TEST_OPTS = MLLIB_FEATURE_TEST_OPTS + \ - [OptionSet("num-sentences", [1000000], can_scale=True), - OptionSet("num-words", [10000], can_scale=False), - OptionSet("vector-size", [100], can_scale=False), - OptionSet("num-iterations", [3], can_scale=False), - OptionSet("min-count", [5], can_scale=False)] - -if MLLIB_SPARK_VERSION >= 1.3: # TODO: make it work in 1.2 - MLLIB_TESTS += [("word2vec", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("word2vec")] + MLLIB_WORD2VEC_TEST_OPTS)] - -# Frequent Pattern Matching Tests # - -MLLIB_FPM_TEST_OPTS = MLLIB_COMMON_OPTS - -MLLIB_FP_GROWTH_TEST_OPTS = MLLIB_FPM_TEST_OPTS + \ - [OptionSet("num-baskets", 
[5000000], can_scale=True), - OptionSet("avg-basket-size", [10], can_scale=False), - OptionSet("num-items", [1000], can_scale=False), - OptionSet("min-support", [0.01], can_scale=False)] - -if MLLIB_SPARK_VERSION >= 1.3: - MLLIB_TESTS += [("fp-growth", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("fp-growth")] + MLLIB_FP_GROWTH_TEST_OPTS)] - -# TODO: tune test size to have runtime within 30-60 seconds -MLLIB_PREFIX_SPAN_TEST_OPTS = MLLIB_FPM_TEST_OPTS + \ - [OptionSet("num-sequences", [5000000], can_scale=True), - OptionSet("avg-sequence-size", [5], can_scale=False), - OptionSet("avg-itemset-size", [2], can_scale=False), - OptionSet("num-items", [500], can_scale=False), - OptionSet("min-support", [0.5], can_scale=False), - OptionSet("max-pattern-len", [10], can_scale=False), - OptionSet("max-local-proj-db-size", [32000000], can_scale=False)] - -if MLLIB_SPARK_VERSION >= 1.5: - MLLIB_TESTS += [("prefix-span", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("prefix-span")] + MLLIB_PREFIX_SPAN_TEST_OPTS)] - -# Python MLlib tests -PYTHON_MLLIB_TESTS = [] - -PYTHON_MLLIB_TESTS += [("python-glm-classification", "mllib_tests.py", SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("GLMClassificationTest")] + - MLLIB_GLM_CLASSIFICATION_TEST_OPTS)] - -PYTHON_MLLIB_TESTS += [("python-glm-regression", "mllib_tests.py", SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("GLMRegressionTest")] + - MLLIB_GLM_REGRESSION_TEST_OPTS)] - -PYTHON_MLLIB_TESTS += [("python-naive-bayes", "mllib_tests.py", SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("NaiveBayesTest")] + - NAIVE_BAYES_TEST_OPTS)] - -PYTHON_MLLIB_TESTS += [("python-als", "mllib_tests.py", SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("ALSTest")] + - MLLIB_RECOMMENDATION_TEST_OPTS)] - -PYTHON_MLLIB_TESTS += [("python-kmeans", "mllib_tests.py", SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("KMeansTest")] + MLLIB_CLUSTERING_TEST_OPTS)] - -if MLLIB_SPARK_VERSION >= 1.1: - PYTHON_MLLIB_TESTS += [("python-pearson", "mllib_tests.py", SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("PearsonCorrelationTest")] + - MLLIB_PEARSON_TEST_OPTS)] - - PYTHON_MLLIB_TESTS += [("python-spearman", "mllib_tests.py", SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("SpearmanCorrelationTest")] + - MLLIB_SPEARMAN_TEST_OPTS)] - diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala index cd392d8..ebf12ab 100644 --- a/mllib-tests/project/MLlibTestsBuild.scala +++ b/mllib-tests/project/MLlibTestsBuild.scala @@ -17,14 +17,20 @@ object MLlibTestsBuild extends Build { organization := "org.spark-project", version := "0.1", scalaVersion := "2.11.8", - sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0-preview"), + sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0-SNAPSHOT"), libraryDependencies ++= Seq( "net.sf.jopt-simple" % "jopt-simple" % "4.6", "org.scalatest" %% "scalatest" % "2.2.1" % "test", "org.slf4j" % "slf4j-log4j12" % "1.7.2", - "org.json4s" %% "json4s-native" % "3.2.10", - "org.apache.spark" %% "spark-core" % "2.0.0-preview" % "provided", - "org.apache.spark" %% "spark-mllib" % "2.0.0-preview" % "provided" + "org.json4s" %% "json4s-native" % "3.2.10" + + // IMPORTANT! 
+ // We need to uncomment the below once Spark 2.0.0 becomes available + // This relies on using spark built under the lib folder + // of this project + + //"org.apache.spark" %% "spark-core" % "2.0.0-SNAPSHOT" % "provided", + //"org.apache.spark" %% "spark-mllib" % "2.0.0-SNAPSHOT" % "provided" ) ) @@ -33,12 +39,12 @@ object MLlibTestsBuild extends Build { file("."), settings = assemblySettings ++ commonSettings ++ Seq( scalaSource in Compile := { + println("sparkVersion.value is: " + sparkVersion.value) val targetFolder = sparkVersion.value match { case v if v.startsWith("1.4.") => "v1p4" case v if v.startsWith("1.5.") => "v1p5" - case v if v.startsWith("1.6.") => - "v1p5" // acceptable for now, but change later when new algs are added - case v if v.startsWith("2.0") => "v2p0" + case v if v.startsWith("1.6.") => "v1p5" + case v if v.startsWith("2.0") => "v2p0" case _ => throw new IllegalArgumentException(s"This Spark version isn't supported: ${sparkVersion.value}.") } baseDirectory.value / targetFolder / "src" / "main" / "scala" diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala index 4dd1d49..6f89aac 100644 --- a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala +++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala @@ -315,13 +315,13 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { def runTest(rdd: RDD[Vector]): KMeansModel - val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") - val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") + val NUM_POINTS = ("num-points", "number of points for clustering tests") + val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) - longOptions = longOptions ++ Seq(NUM_EXAMPLES) + intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) + longOptions = longOptions ++ Seq(NUM_POINTS) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() @@ -329,21 +329,21 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { var testRdd: RDD[Vector] = _ def validate(model: KMeansModel, rdd: RDD[Vector]): Double = { - val numExamples = rdd.cache().count() + val numPoints = rdd.cache().count() val error = model.computeCost(rdd) - math.sqrt(error/numExamples) + math.sqrt(error/numPoints) } override def createInputData(seed: Long) = { val numPartitions: Int = intOptionValue(NUM_PARTITIONS) - val numExamples: Long = longOptionValue(NUM_EXAMPLES) - val numFeatures: Int = intOptionValue(NUM_FEATURES) + val numPoints: Long = longOptionValue(NUM_POINTS) + val numColumns: Int = intOptionValue(NUM_COLUMNS) val numCenters: Int = intOptionValue(NUM_CENTERS) - val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numExamples*1.25).toLong, numFeatures, + val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numPoints*1.25).toLong, numColumns, numCenters, numPartitions, seed) val split = data.randomSplit(Array(0.8, 0.2), seed) @@ -441,10 +441,9 @@ class ALSTest(sc: SparkContext) extends RecommendationTests(sc) { val rank: Int = intOptionValue(RANK) val regParam = doubleOptionValue(REG_PARAM) val seed = 
intOptionValue(RANDOM_SEED) + 12 - val implicitRatings: Boolean = booleanOptionValue(IMPLICIT) new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam) - .setBlocks(rdd.partitions.length).setImplicitPrefs(implicitRatings).run(rdd) + .setBlocks(rdd.partitions.size).run(rdd) } } diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala index 5903e2e..0004f8d 100644 --- a/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala +++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala @@ -16,21 +16,21 @@ import mllib.perf.PerfTest class GaussianMixtureTest(sc: SparkContext) extends PerfTest { // TODO: refactor k-means and GMM code - val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") - val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") + val NUM_POINTS = ("num-points", "number of points for clustering tests") + val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - intOptions ++= Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) - longOptions ++= Seq(NUM_EXAMPLES) + intOptions ++= Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) + longOptions ++= Seq(NUM_POINTS) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() var data: RDD[Vector] = _ override def createInputData(seed: Long): Unit = { - val m = longOptionValue(NUM_EXAMPLES) - val n = intOptionValue(NUM_FEATURES) + val m = longOptionValue(NUM_POINTS) + val n = intOptionValue(NUM_COLUMNS) val k = intOptionValue(NUM_CENTERS) val p = intOptionValue(NUM_PARTITIONS) @@ -47,7 +47,7 @@ class GaussianMixtureTest(sc: SparkContext) extends PerfTest { Vectors.dense(y.data) } }.cache() - logInfo(s"Generated ${data.count()} examples.") + logInfo(s"Generated ${data.count()} points.") } override def run(): JValue = { diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala index 2018c61..6832ffa 100644 --- a/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala +++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala @@ -11,28 +11,28 @@ import mllib.perf.PerfTest class PICTest(sc: SparkContext) extends PerfTest { - val NUM_EXAMPLES = ("num-examples", "number of examples") + val NUM_POINTS = ("num-points", "number of points") val NODE_DEGREE = ("node-degree", "number of neighbors each node is connected to") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") intOptions ++= Seq(NODE_DEGREE, NUM_CENTERS, NUM_ITERATIONS) - longOptions ++= Seq(NUM_EXAMPLES) + longOptions ++= Seq(NUM_POINTS) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() var data: RDD[(Long, Long, Double)] = _ override def createInputData(seed: Long): Unit = { - val numExamples = longOptionValue(NUM_EXAMPLES) + val numPoints = longOptionValue(NUM_POINTS) val nodeDegree = intOptionValue(NODE_DEGREE) val numPartitions = intOptionValue(NUM_PARTITIONS) // Generates a periodic banded matrix with 
bandwidth = nodeDegree
-    data = sc.parallelize(0L to numExamples, numPartitions)
+    data = sc.parallelize(0L to numPoints, numPartitions)
       .flatMap { id =>
-        (((id - nodeDegree / 2) % numExamples) until id).map { nbr =>
-          (id, (nbr + numExamples) % numExamples, 1D)
+        (((id - nodeDegree / 2) % numPoints) until id).map { nbr =>
+          (id, (nbr + numPoints) % numPoints, 1D)
         }
       }
     logInfo(s"Generated ${data.count()} pairwise similarities.")
@@ -46,7 +46,6 @@ class PICTest(sc: SparkContext) extends PerfTest {
       .setK(k)
       .setMaxIterations(numIterations)
     val model = pic.run(data)
-    model.assignments.count()
     val duration = (System.currentTimeMillis() - start) / 1e3
     "time" -> duration
   }
diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala
index f721ca7..6e354fd 100644
--- a/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala
+++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala
@@ -509,7 +509,7 @@ class FeaturesGenerator(val categoricalArities: Array[Int], val numContinuous: I
 class KMeansDataGenerator(
     val numCenters: Int,
-    val numFeatures: Int,
+    val numColumns: Int,
     val seed: Long) extends RandomDataGenerator[Vector] {

   private val rng = new java.util.Random(seed)

@@ -528,7 +528,7 @@ class KMeansDataGenerator(
   }

   private val centers = (0 until numCenters).map{i =>
-    Array.fill(numFeatures)((2 * rng.nextDouble() - 1)*scale_factors(i))
+    Array.fill(numColumns)((2 * rng.nextDouble() - 1)*scale_factors(i))
   }

   override def nextValue(): Vector = {
@@ -536,12 +536,12 @@ class KMeansDataGenerator(

     val centerToAddTo = centers(concentrations.indexWhere(p => pick_center_rand <= p))

-    Vectors.dense(Array.tabulate(numFeatures)(i => centerToAddTo(i) + rng2.nextGaussian()))
+    Vectors.dense(Array.tabulate(numColumns)(i => centerToAddTo(i) + rng2.nextGaussian()))
   }

   override def setSeed(seed: Long) {
     rng.setSeed(seed)
   }

-  override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numFeatures, seed)
+  override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numColumns, seed)
 }
diff --git a/mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala
index 1c06465..1f1ec27 100644
--- a/mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala
+++ b/mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala
@@ -97,12 +97,10 @@ abstract class GLMTests(sc: SparkContext)
 class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) {

   val INTERCEPT = ("intercept", "intercept for random data generation")
-  val FEATURE_NOISE = ("feature-noise",
-    "scale factor for the noise during feature generation; CURRENTLY IGNORED")
   val LABEL_NOISE = ("label-noise", "scale factor for the noise during label generation")
   val LOSS = ("loss", "loss to minimize.
Supported: l2 (squared error).") - doubleOptions = doubleOptions ++ Seq(INTERCEPT, FEATURE_NOISE, LABEL_NOISE) + doubleOptions = doubleOptions ++ Seq(INTERCEPT, LABEL_NOISE) stringOptions = stringOptions ++ Seq(LOSS) val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions @@ -160,7 +158,6 @@ class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) { .setElasticNetParam(elasticNetParam) .setRegParam(regParam) .setMaxIter(numIterations) - .setTol(0.0) val sqlContext = new SQLContext(rdd.context) import sqlContext.implicits._ val mlModel = rr.fit(rdd.toDF()) @@ -268,7 +265,6 @@ class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) { .setElasticNetParam(elasticNetParam) .setRegParam(regParam) .setMaxIter(numIterations) - .setTol(0.0) val sqlContext = new SQLContext(rdd.context) import sqlContext.implicits._ val mlModel = lor.fit(rdd.toDF()) @@ -383,8 +379,6 @@ abstract class RecommendationTests(sc: SparkContext) extends PerfTest { val testMetric = validate(model, testRdd) - /* - // Removed temporarily because these methods are really slow. val numThingsToRecommend = 10 start = System.currentTimeMillis() model.recommendProductsForUsers(numThingsToRecommend).count() @@ -392,11 +386,11 @@ abstract class RecommendationTests(sc: SparkContext) extends PerfTest { start = System.currentTimeMillis() model.recommendUsersForProducts(numThingsToRecommend).count() val recommendUsersForProductsTime = (System.currentTimeMillis() - start).toDouble / 1000.0 - */ + Map("trainingTime" -> trainingTime, "testTime" -> testTime, - "trainingMetric" -> trainingMetric, "testMetric" -> testMetric) - // "recommendProductsForUsersTime" -> recommendProductsForUsersTime, - // "recommendUsersForProductsTime" -> recommendUsersForProductsTime) + "trainingMetric" -> trainingMetric, "testMetric" -> testMetric, + "recommendProductsForUsersTime" -> recommendProductsForUsersTime, + "recommendUsersForProductsTime" -> recommendUsersForProductsTime) } } @@ -404,13 +398,13 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { def runTest(rdd: RDD[Vector]): KMeansModel - val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") - val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") + val NUM_POINTS = ("num-points", "number of points for clustering tests") + val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) - longOptions = longOptions ++ Seq(NUM_EXAMPLES) + intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) + longOptions = longOptions ++ Seq(NUM_POINTS) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() @@ -418,21 +412,21 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { var testRdd: RDD[Vector] = _ def validate(model: KMeansModel, rdd: RDD[Vector]): Double = { - val numExamples = rdd.cache().count() + val numPoints = rdd.cache().count() val error = model.computeCost(rdd) - math.sqrt(error/numExamples) + math.sqrt(error/numPoints) } override def createInputData(seed: Long) = { val numPartitions: Int = intOptionValue(NUM_PARTITIONS) - val numExamples: Long = longOptionValue(NUM_EXAMPLES) - val numFeatures: Int = 
intOptionValue(NUM_FEATURES) + val numPoints: Long = longOptionValue(NUM_POINTS) + val numColumns: Int = intOptionValue(NUM_COLUMNS) val numCenters: Int = intOptionValue(NUM_CENTERS) - val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numExamples*1.25).toLong, numFeatures, + val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numPoints*1.25).toLong, numColumns, numCenters, numPartitions, seed) val split = data.randomSplit(Array(0.8, 0.2), seed) @@ -530,10 +524,9 @@ class ALSTest(sc: SparkContext) extends RecommendationTests(sc) { val rank: Int = intOptionValue(RANK) val regParam = doubleOptionValue(REG_PARAM) val seed = intOptionValue(RANDOM_SEED) + 12 - val implicitRatings: Boolean = booleanOptionValue(IMPLICIT) new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam) - .setBlocks(rdd.partitions.length).setImplicitPrefs(implicitRatings).run(rdd) + .setBlocks(rdd.partitions.length).run(rdd) } } diff --git a/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala b/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala index 13da1ac..95ce9c6 100644 --- a/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala +++ b/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala @@ -15,21 +15,21 @@ import mllib.perf.PerfTest class GaussianMixtureTest(sc: SparkContext) extends PerfTest { - val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") - val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") + val NUM_POINTS = ("num-points", "number of points for clustering tests") + val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - intOptions ++= Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) - longOptions ++= Seq(NUM_EXAMPLES) + intOptions ++= Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) + longOptions ++= Seq(NUM_POINTS) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() var data: RDD[Vector] = _ override def createInputData(seed: Long): Unit = { - val m = longOptionValue(NUM_EXAMPLES) - val n = intOptionValue(NUM_FEATURES) + val m = longOptionValue(NUM_POINTS) + val n = intOptionValue(NUM_COLUMNS) val k = intOptionValue(NUM_CENTERS) val p = intOptionValue(NUM_PARTITIONS) @@ -46,7 +46,7 @@ class GaussianMixtureTest(sc: SparkContext) extends PerfTest { Vectors.dense(y.data) } }.cache() - logInfo(s"Generated ${data.count()} examples.") + logInfo(s"Generated ${data.count()} points.") } override def run(): JValue = { diff --git a/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/PICTest.scala b/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/PICTest.scala index 2018c61..6832ffa 100644 --- a/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/PICTest.scala +++ b/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/PICTest.scala @@ -11,28 +11,28 @@ import mllib.perf.PerfTest class PICTest(sc: SparkContext) extends PerfTest { - val NUM_EXAMPLES = ("num-examples", "number of examples") + val NUM_POINTS = ("num-points", "number of points") val NODE_DEGREE = ("node-degree", "number of neighbors each node is connected to") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", 
"number of iterations for the algorithm") intOptions ++= Seq(NODE_DEGREE, NUM_CENTERS, NUM_ITERATIONS) - longOptions ++= Seq(NUM_EXAMPLES) + longOptions ++= Seq(NUM_POINTS) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() var data: RDD[(Long, Long, Double)] = _ override def createInputData(seed: Long): Unit = { - val numExamples = longOptionValue(NUM_EXAMPLES) + val numPoints = longOptionValue(NUM_POINTS) val nodeDegree = intOptionValue(NODE_DEGREE) val numPartitions = intOptionValue(NUM_PARTITIONS) // Generates a periodic banded matrix with bandwidth = nodeDegree - data = sc.parallelize(0L to numExamples, numPartitions) + val data = sc.parallelize(0L to numPoints, numPartitions) .flatMap { id => - (((id - nodeDegree / 2) % numExamples) until id).map { nbr => - (id, (nbr + numExamples) % numExamples, 1D) + (((id - nodeDegree / 2) % numPoints) until id).map { nbr => + (id, (nbr + numPoints) % numPoints, 1D) } } logInfo(s"Generated ${data.count()} pairwise similarities.") @@ -46,7 +46,6 @@ class PICTest(sc: SparkContext) extends PerfTest { .setK(k) .setMaxIterations(numIterations) val model = pic.run(data) - model.assignments.count() val duration = (System.currentTimeMillis() - start) / 1e3 "time" -> duration } diff --git a/mllib-tests/v1p5/src/main/scala/mllib/perf/util/DataGenerator.scala b/mllib-tests/v1p5/src/main/scala/mllib/perf/util/DataGenerator.scala index 33f041e..e65a5a5 100644 --- a/mllib-tests/v1p5/src/main/scala/mllib/perf/util/DataGenerator.scala +++ b/mllib-tests/v1p5/src/main/scala/mllib/perf/util/DataGenerator.scala @@ -548,7 +548,7 @@ class FeaturesGenerator(val categoricalArities: Array[Int], val numContinuous: I class KMeansDataGenerator( val numCenters: Int, - val numFeatures: Int, + val numColumns: Int, val seed: Long) extends RandomDataGenerator[Vector] { private val rng = new java.util.Random(seed) @@ -567,7 +567,7 @@ class KMeansDataGenerator( } private val centers = (0 until numCenters).map{i => - Array.fill(numFeatures)((2 * rng.nextDouble() - 1)*scale_factors(i)) + Array.fill(numColumns)((2 * rng.nextDouble() - 1)*scale_factors(i)) } override def nextValue(): Vector = { @@ -575,12 +575,12 @@ class KMeansDataGenerator( val centerToAddTo = centers(concentrations.indexWhere(p => pick_center_rand <= p)) - Vectors.dense(Array.tabulate(numFeatures)(i => centerToAddTo(i) + rng2.nextGaussian())) + Vectors.dense(Array.tabulate(numColumns)(i => centerToAddTo(i) + rng2.nextGaussian())) } override def setSeed(seed: Long) { rng.setSeed(seed) } - override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numFeatures, seed) + override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numColumns, seed) } diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/LinearAlgebraTests.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/LinearAlgebraTests.scala new file mode 100644 index 0000000..b992173 --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/LinearAlgebraTests.scala @@ -0,0 +1,68 @@ +package mllib.perf + +import org.json4s.JsonAST.JValue +import org.json4s.JsonDSL._ + +import org.apache.spark.SparkContext +import org.apache.spark.mllib.linalg.distributed.RowMatrix + +import mllib.perf.util.DataGenerator + +/** Parent class for linear algebra tests which run on a large dataset. 
+ * Generated this way so that SVD / PCA can be added easily + */ +abstract class LinearAlgebraTests(sc: SparkContext) extends PerfTest { + + def runTest(rdd: RowMatrix, rank: Int) + + val NUM_ROWS = ("num-rows", "number of rows of the matrix") + val NUM_COLS = ("num-cols", "number of columns of the matrix") + val RANK = ("rank", "number of leading singular values") + + longOptions = Seq(NUM_ROWS) + intOptions = intOptions ++ Seq(RANK, NUM_COLS) + + var rdd: RowMatrix = _ + + val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions + addOptionsToParser() + override def createInputData(seed: Long) = { + val m: Long = longOptionValue(NUM_ROWS) + val n: Int = intOptionValue(NUM_COLS) + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + + rdd = DataGenerator.generateDistributedSquareMatrix(sc, m, n, numPartitions, seed) + } + + override def run(): JValue = { + val rank = intOptionValue(RANK) + + val start = System.currentTimeMillis() + runTest(rdd, rank) + val end = System.currentTimeMillis() + val time = (end - start).toDouble / 1000.0 + + Map("time" -> time) + } +} + + +class SVDTest(sc: SparkContext) extends LinearAlgebraTests(sc) { + override def runTest(data: RowMatrix, rank: Int) { + data.computeSVD(rank, computeU = true) + } +} + +class PCATest(sc: SparkContext) extends LinearAlgebraTests(sc) { + override def runTest(data: RowMatrix, rank: Int) { + val principal = data.computePrincipalComponents(rank) + sc.broadcast(principal) + data.multiply(principal) + } +} + +class ColumnSummaryStatisticsTest(sc: SparkContext) extends LinearAlgebraTests(sc) { + override def runTest(data: RowMatrix, rank: Int) { + data.computeColumnSummaryStatistics() + } +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/MLAlgorithmTests.scala new file mode 100644 index 0000000..693ca7c --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/MLAlgorithmTests.scala @@ -0,0 +1,779 @@ +package mllib.perf + +import org.json4s.JsonAST._ +import org.json4s.JsonDSL._ + +import org.apache.spark.SparkContext +import org.apache.spark.ml.PredictionModel +import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier, RandomForestClassificationModel, RandomForestClassifier, LogisticRegression} +import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor, RandomForestRegressionModel, RandomForestRegressor, LinearRegression} +import org.apache.spark.mllib.classification._ +import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} +import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.optimization.{SquaredL2Updater, L1Updater, SimpleUpdater} +import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating} +import org.apache.spark.mllib.regression._ +import org.apache.spark.mllib.tree.{GradientBoostedTrees, RandomForest} +import org.apache.spark.mllib.tree.configuration.{Algo, BoostingStrategy, QuantileStrategy, Strategy} +import org.apache.spark.mllib.tree.impurity.Variance +import org.apache.spark.mllib.tree.loss.{LogLoss, SquaredError} +import org.apache.spark.mllib.tree.model.{GradientBoostedTreesModel, RandomForestModel} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext + +import mllib.perf.util.{DataGenerator, DataLoader} + +/** Parent class for tests which run on a large dataset. 
*/ +abstract class RegressionAndClassificationTests[M](sc: SparkContext) extends PerfTest { + + def runTest(rdd: RDD[LabeledPoint]): M + + def validate(model: M, rdd: RDD[LabeledPoint]): Double + + val NUM_EXAMPLES = ("num-examples", "number of examples for regression tests") + val NUM_FEATURES = ("num-features", "number of features of each example for regression tests") + + intOptions = intOptions ++ Seq(NUM_FEATURES) + longOptions = Seq(NUM_EXAMPLES) + + var rdd: RDD[LabeledPoint] = _ + var testRdd: RDD[LabeledPoint] = _ + + override def run(): JValue = { + var start = System.currentTimeMillis() + val model = runTest(rdd) + val trainingTime = (System.currentTimeMillis() - start).toDouble / 1000.0 + + start = System.currentTimeMillis() + val trainingMetric = validate(model, rdd) + val testTime = (System.currentTimeMillis() - start).toDouble / 1000.0 + + val testMetric = validate(model, testRdd) + Map("trainingTime" -> trainingTime, "testTime" -> testTime, + "trainingMetric" -> trainingMetric, "testMetric" -> testMetric) + } + + /** + * For classification + * @param predictions RDD over (prediction, truth) for each instance + * @return Percent correctly classified + */ + def calculateAccuracy(predictions: RDD[(Double, Double)], numExamples: Long): Double = { + predictions.map{case (pred, label) => + if (pred == label) 1.0 else 0.0 + }.sum() * 100.0 / numExamples + } + + /** + * For regression + * @param predictions RDD over (prediction, truth) for each instance + * @return Root mean squared error (RMSE) + */ + def calculateRMSE(predictions: RDD[(Double, Double)], numExamples: Long): Double = { + val error = predictions.map{ case (pred, label) => + (pred - label) * (pred - label) + }.sum() + math.sqrt(error / numExamples) + } +} + +/** Parent class for Generalized Linear Model (GLM) tests */ +abstract class GLMTests(sc: SparkContext) + extends RegressionAndClassificationTests[GeneralizedLinearModel](sc) { + + val STEP_SIZE = ("step-size", "step size for SGD") + val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") + val REG_TYPE = ("reg-type", "type of regularization: none, l1, l2, elastic-net") + val ELASTIC_NET_PARAM = ("elastic-net-param", "elastic-net param, 0.0 for L2, and 1.0 for L1") + val REG_PARAM = ("reg-param", "the regularization parameter against overfitting") + val OPTIMIZER = ("optimizer", "optimization algorithm (elastic-net only supports l-bfgs): sgd, l-bfgs") + + intOptions = intOptions ++ Seq(NUM_ITERATIONS) + doubleOptions = doubleOptions ++ Seq(ELASTIC_NET_PARAM, STEP_SIZE, REG_PARAM) + stringOptions = stringOptions ++ Seq(REG_TYPE, OPTIMIZER) +} + +class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) { + + val INTERCEPT = ("intercept", "intercept for random data generation") + val LABEL_NOISE = ("label-noise", "scale factor for the noise during label generation") + val LOSS = ("loss", "loss to minimize. 
Supported: l2 (squared error).")
+
+  doubleOptions = doubleOptions ++ Seq(INTERCEPT, LABEL_NOISE)
+  stringOptions = stringOptions ++ Seq(LOSS)
+
+  val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions
+  addOptionsToParser()
+
+  override def createInputData(seed: Long) = {
+    val numExamples: Long = longOptionValue(NUM_EXAMPLES)
+    val numFeatures: Int = intOptionValue(NUM_FEATURES)
+    val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
+
+    val intercept: Double = doubleOptionValue(INTERCEPT)
+    val labelNoise: Double = doubleOptionValue(LABEL_NOISE)
+
+    val data = DataGenerator.generateLabeledPoints(sc, math.ceil(numExamples * 1.25).toLong,
+      numFeatures, intercept, labelNoise, numPartitions, seed)
+
+    val split = data.randomSplit(Array(0.8, 0.2), seed)
+
+    rdd = split(0).cache()
+    testRdd = split(1)
+
+    // Materialize rdd
+    println("Num Examples: " + rdd.count())
+  }
+
+  override def validate(model: GeneralizedLinearModel, rdd: RDD[LabeledPoint]): Double = {
+    val numExamples = rdd.count()
+    val predictions: RDD[(Double, Double)] = rdd.map { example =>
+      (model.predict(example.features), example.label)
+    }
+    calculateRMSE(predictions, numExamples)
+  }
+
+  override def runTest(rdd: RDD[LabeledPoint]): GeneralizedLinearModel = {
+    val stepSize = doubleOptionValue(STEP_SIZE)
+    val loss = stringOptionValue(LOSS)
+    val regType = stringOptionValue(REG_TYPE)
+    val regParam = doubleOptionValue(REG_PARAM)
+    val elasticNetParam = doubleOptionValue(ELASTIC_NET_PARAM)
+    val numIterations = intOptionValue(NUM_ITERATIONS)
+    val optimizer = stringOptionValue(OPTIMIZER)
+
+    // Linear Regression only supports squared loss for now.
+    if (!Array("l2").contains(loss)) {
+      throw new IllegalArgumentException(
+        s"GLMRegressionTest run with unknown loss ($loss). Supported values: l2.")
+    }
+
+    if (regType == "elastic-net") { // use spark.ml
+      assert(optimizer == "auto" || optimizer == "l-bfgs", "GLMRegressionTest with" +
+        s" regType=elastic-net expects optimizer to be in {auto, l-bfgs}, but found: $optimizer")
+      println("WARNING: Linear Regression with elastic-net in ML package uses LBFGS/OWLQN for" +
+        " optimization which ignores stepSize.")
+      val rr = new LinearRegression()
+        .setElasticNetParam(elasticNetParam)
+        .setRegParam(regParam)
+        .setMaxIter(numIterations)
+      val sqlContext = new SQLContext(rdd.context)
+      import sqlContext.implicits._
+      val mlModel = rr.fit(rdd.toDF())
+
+      new LinearRegressionModel(Vectors.fromML(mlModel.coefficients),
+        mlModel.intercept)
+
+    } else {
+      assert(optimizer == "sgd", "GLMRegressionTest with" +
+        s" regType!=elastic-net expects optimizer to be sgd, but found: $optimizer")
+      (loss, regType) match {
+        case ("l2", "none") =>
+          val lr = new LinearRegressionWithSGD().setIntercept(addIntercept = true)
+          lr.optimizer
+            .setNumIterations(numIterations)
+            .setStepSize(stepSize)
+            .setConvergenceTol(0.0)
+          lr.run(rdd)
+        case ("l2", "l1") =>
+          val lasso = new LassoWithSGD().setIntercept(addIntercept = true)
+          lasso.optimizer
+            .setNumIterations(numIterations)
+            .setStepSize(stepSize)
+            .setRegParam(regParam)
+            .setConvergenceTol(0.0)
+          lasso.run(rdd)
+        case ("l2", "l2") =>
+          val rr = new RidgeRegressionWithSGD().setIntercept(addIntercept = true)
+          rr.optimizer
+            .setNumIterations(numIterations)
+            .setStepSize(stepSize)
+            .setRegParam(regParam)
+            .setConvergenceTol(0.0)
+          rr.run(rdd)
+        case _ =>
+          throw new IllegalArgumentException(
+            s"GLMRegressionTest given incompatible (loss, regType) = ($loss, $regType)."
+              + s" Note the set of supported combinations increases in later Spark versions.")
+      }
+    }
+  }
+}
+
+class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) {
+
+  val THRESHOLD = ("per-negative", "probability for a negative label during data generation")
+  val FEATURE_NOISE = ("feature-noise", "scale factor for the noise during feature generation")
+  val LOSS = ("loss", "loss to minimize. Supported: logistic, hinge (SVM).")
+
+  doubleOptions = doubleOptions ++ Seq(THRESHOLD, FEATURE_NOISE)
+  stringOptions = stringOptions ++ Seq(LOSS)
+
+  val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions
+  addOptionsToParser()
+
+  override def validate(model: GeneralizedLinearModel, rdd: RDD[LabeledPoint]): Double = {
+    val numExamples = rdd.count()
+    val predictions: RDD[(Double, Double)] = rdd.map { example =>
+      (model.predict(example.features), example.label)
+    }
+    calculateAccuracy(predictions, numExamples)
+  }
+
+  override def createInputData(seed: Long) = {
+    val numExamples: Long = longOptionValue(NUM_EXAMPLES)
+    val numFeatures: Int = intOptionValue(NUM_FEATURES)
+    val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
+
+    val threshold: Double = doubleOptionValue(THRESHOLD)
+    val featureNoise: Double = doubleOptionValue(FEATURE_NOISE)
+
+    val data = DataGenerator.generateClassificationLabeledPoints(sc,
+      math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, featureNoise, numPartitions,
+      seed)
+
+    val split = data.randomSplit(Array(0.8, 0.2), seed)
+
+    rdd = split(0).cache()
+    testRdd = split(1)
+
+    // Materialize rdd
+    println("Num Examples: " + rdd.count())
+  }
+
+  override def runTest(rdd: RDD[LabeledPoint]): GeneralizedLinearModel = {
+    val stepSize = doubleOptionValue(STEP_SIZE)
+    val loss = stringOptionValue(LOSS)
+    val regType = stringOptionValue(REG_TYPE)
+    val regParam = doubleOptionValue(REG_PARAM)
+    val elasticNetParam = doubleOptionValue(ELASTIC_NET_PARAM)
+    val numIterations = intOptionValue(NUM_ITERATIONS)
+    val optimizer = stringOptionValue(OPTIMIZER)
+
+    // For classification problems in GLM, we currently support logistic loss and hinge loss.
+    if (!Array("logistic", "hinge").contains(loss)) {
+      throw new IllegalArgumentException(
+        s"GLMClassificationTest run with unknown loss ($loss). Supported values: logistic, hinge.")
+    }
+
+    if (regType == "elastic-net") { // use spark.ml
+      assert(optimizer == "auto" || optimizer == "l-bfgs", "GLMClassificationTest with" +
+        " regType=elastic-net expects optimizer to be in {auto, l-bfgs}")
+      loss match {
+        case "logistic" =>
+          println("WARNING: Logistic Regression with elastic-net in ML package uses LBFGS/OWLQN" +
+            " for optimization which ignores stepSize.")
+          val lor = new LogisticRegression()
+            .setElasticNetParam(elasticNetParam)
+            .setRegParam(regParam)
+            .setMaxIter(numIterations)
+          val sqlContext = new SQLContext(rdd.context)
+          import sqlContext.implicits._
+          val mlModel = lor.fit(rdd.toDF())
+          new LogisticRegressionModel(Vectors.fromML(mlModel.coefficients), mlModel.intercept)
+        case _ =>
+          throw new IllegalArgumentException(
+            s"GLMClassificationTest given unsupported loss = $loss."
+ + s" Note the set of supported combinations increases in later Spark versions.") + } + } else { + val updater = regType match { + case "none" => new SimpleUpdater + case "l1" => new L1Updater + case "l2" => new SquaredL2Updater + } + (loss, optimizer) match { + case ("logistic", "sgd") => + val lr = new LogisticRegressionWithSGD() + lr.optimizer + .setStepSize(stepSize) + .setNumIterations(numIterations) + .setConvergenceTol(0.0) + .setUpdater(updater) + lr.run(rdd) + case ("logistic", "l-bfgs") => + println("WARNING: LogisticRegressionWithLBFGS ignores stepSize in this Spark version.") + val lr = new LogisticRegressionWithLBFGS() + lr.optimizer + .setNumIterations(numIterations) + .setConvergenceTol(0.0) + .setUpdater(updater) + lr.run(rdd) + case ("hinge", "sgd") => + val svm = new SVMWithSGD() + svm.optimizer + .setNumIterations(numIterations) + .setStepSize(stepSize) + .setRegParam(regParam) + .setConvergenceTol(0.0) + .setUpdater(updater) + svm.run(rdd) + case _ => + throw new IllegalArgumentException( + s"GLMClassificationTest given incompatible (loss, regType) = ($loss, $regType)." + + s" Supported combinations include: (elastic-net, _), (logistic, sgd), (logistic, l-bfgs), (hinge, sgd)." + + s" Note the set of supported combinations increases in later Spark versions.") + } + } + } +} + +abstract class RecommendationTests(sc: SparkContext) extends PerfTest { + + def runTest(rdd: RDD[Rating]): MatrixFactorizationModel + + val NUM_USERS = ("num-users", "number of users for recommendation tests") + val NUM_PRODUCTS = ("num-products", "number of features of each example for recommendation tests") + val NUM_RATINGS = ("num-ratings", "number of ratings for recommendation tests") + val RANK = ("rank", "rank of factorized matrices for recommendation tests") + val IMPLICIT = ("implicit-prefs", "use implicit ratings") + val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") + val REG_PARAM = ("reg-param", "the regularization parameter against overfitting") + + intOptions = intOptions ++ Seq(NUM_USERS, NUM_PRODUCTS, RANK, NUM_ITERATIONS) + longOptions = longOptions ++ Seq(NUM_RATINGS) + booleanOptions = booleanOptions ++ Seq(IMPLICIT) + doubleOptions = doubleOptions ++ Seq(REG_PARAM) + val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions + addOptionsToParser() + + var rdd: RDD[Rating] = _ + var testRdd: RDD[Rating] = _ + + override def createInputData(seed: Long) = { + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + + val numUsers: Int = intOptionValue(NUM_USERS) + val numProducts: Int = intOptionValue(NUM_PRODUCTS) + val numRatings: Long = longOptionValue(NUM_RATINGS) + val implicitRatings: Boolean = booleanOptionValue(IMPLICIT) + + val data = DataGenerator.generateRatings(sc, numUsers, numProducts, + numRatings, implicitRatings, numPartitions, seed) + + rdd = data._1.cache() + testRdd = data._2 + + // Materialize rdd + println("Num Examples: " + rdd.count()) + } + + def validate(model: MatrixFactorizationModel, + data: RDD[Rating]): Double = { + val implicitPrefs: Boolean = booleanOptionValue(IMPLICIT) + val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product))) + val predictionsAndRatings: RDD[(Double, Double)] = predictions.map{ x => + def mapPredictedRating(r: Double) = if (implicitPrefs) math.max(math.min(r, 1.0), 0.0) else r + ((x.user, x.product), mapPredictedRating(x.rating)) + }.join(data.map(x => ((x.user, x.product), x.rating))).values + + math.sqrt(predictionsAndRatings.map(x => 
(x._1 - x._2) * (x._1 - x._2)).mean()) + } + + override def run(): JValue = { + var start = System.currentTimeMillis() + val model = runTest(rdd) + val trainingTime = (System.currentTimeMillis() - start).toDouble / 1000.0 + + start = System.currentTimeMillis() + val trainingMetric = validate(model, rdd) + val testTime = (System.currentTimeMillis() - start).toDouble / 1000.0 + + val testMetric = validate(model, testRdd) + + val numThingsToRecommend = 10 + start = System.currentTimeMillis() + model.recommendProductsForUsers(numThingsToRecommend).count() + val recommendProductsForUsersTime = (System.currentTimeMillis() - start).toDouble / 1000.0 + start = System.currentTimeMillis() + model.recommendUsersForProducts(numThingsToRecommend).count() + val recommendUsersForProductsTime = (System.currentTimeMillis() - start).toDouble / 1000.0 + + Map("trainingTime" -> trainingTime, "testTime" -> testTime, + "trainingMetric" -> trainingMetric, "testMetric" -> testMetric, + "recommendProductsForUsersTime" -> recommendProductsForUsersTime, + "recommendUsersForProductsTime" -> recommendUsersForProductsTime) + } +} + +abstract class ClusteringTests(sc: SparkContext) extends PerfTest { + + def runTest(rdd: RDD[Vector]): KMeansModel + + val NUM_POINTS = ("num-points", "number of points for clustering tests") + val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") + val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") + val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") + + intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) + longOptions = longOptions ++ Seq(NUM_POINTS) + val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions + addOptionsToParser() + + var rdd: RDD[Vector] = _ + var testRdd: RDD[Vector] = _ + + def validate(model: KMeansModel, rdd: RDD[Vector]): Double = { + val numPoints = rdd.cache().count() + + val error = model.computeCost(rdd) + + math.sqrt(error/numPoints) + } + + override def createInputData(seed: Long) = { + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + + val numPoints: Long = longOptionValue(NUM_POINTS) + val numColumns: Int = intOptionValue(NUM_COLUMNS) + val numCenters: Int = intOptionValue(NUM_CENTERS) + + val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numPoints*1.25).toLong, numColumns, + numCenters, numPartitions, seed) + + val split = data.randomSplit(Array(0.8, 0.2), seed) + + rdd = split(0).cache() + testRdd = split(1) + + // Materialize rdd + println("Num Examples: " + rdd.count()) + } + + override def run(): JValue = { + var start = System.currentTimeMillis() + val model = runTest(rdd) + val trainingTime = (System.currentTimeMillis() - start).toDouble / 1000.0 + + start = System.currentTimeMillis() + val trainingMetric = validate(model, rdd) + val testTime = (System.currentTimeMillis() - start).toDouble / 1000.0 + + val testMetric = validate(model, testRdd) + Map("trainingTime" -> trainingTime, "testTime" -> testTime, + "trainingMetric" -> trainingMetric, "testMetric" -> testMetric) + } +} + +// Classification Algorithms + +class NaiveBayesTest(sc: SparkContext) + extends RegressionAndClassificationTests[NaiveBayesModel](sc) { + + val THRESHOLD = ("per-negative", "probability for a negative label during data generation") + val FEATURE_NOISE = ("feature-noise", "scale factor for the noise during feature generation") + val SMOOTHING = ("nb-lambda", "the smoothing parameter lambda for Naive Bayes") + 
val MODEL_TYPE = ("model-type", "either multinomial (default) or bernoulli") + + doubleOptions = doubleOptions ++ Seq(THRESHOLD, FEATURE_NOISE, SMOOTHING) + stringOptions = stringOptions ++ Seq(MODEL_TYPE) + val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions + addOptionsToParser() + + /** Note: using same data generation as for GLMClassificationTest, but should change later */ + override def createInputData(seed: Long) = { + val numExamples: Long = longOptionValue(NUM_EXAMPLES) + val numFeatures: Int = intOptionValue(NUM_FEATURES) + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + + val threshold: Double = doubleOptionValue(THRESHOLD) + val featureNoise: Double = doubleOptionValue(FEATURE_NOISE) + val modelType = stringOptionValue(MODEL_TYPE) + + val data = if (modelType == "bernoulli") { + DataGenerator.generateBinaryLabeledPoints(sc, + math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, numPartitions, seed) + } else { + val negdata = DataGenerator.generateClassificationLabeledPoints(sc, + math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, featureNoise, numPartitions, + seed) + val dataNonneg = negdata.map { lp => + LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map(math.abs))) + } + dataNonneg + } + + val split = data.randomSplit(Array(0.8, 0.2), seed) + + rdd = split(0).cache() + testRdd = split(1) + + // Materialize rdd + println("Num Examples: " + rdd.count()) + } + + override def validate(model: NaiveBayesModel, rdd: RDD[LabeledPoint]): Double = { + val numExamples = rdd.count() + val predictions: RDD[(Double, Double)] = rdd.map { example => + (model.predict(example.features), example.label) + } + calculateAccuracy(predictions, numExamples) + } + + override def runTest(rdd: RDD[LabeledPoint]): NaiveBayesModel = { + val lambda = doubleOptionValue(SMOOTHING) + + val modelType = stringOptionValue(MODEL_TYPE) + NaiveBayes.train(rdd, lambda, modelType) + } +} + + +// Recommendation +class ALSTest(sc: SparkContext) extends RecommendationTests(sc) { + override def runTest(rdd: RDD[Rating]): MatrixFactorizationModel = { + val numIterations: Int = intOptionValue(NUM_ITERATIONS) + val rank: Int = intOptionValue(RANK) + val regParam = doubleOptionValue(REG_PARAM) + val seed = intOptionValue(RANDOM_SEED) + 12 + + new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam) + .setBlocks(rdd.partitions.length).run(rdd) + } +} + +// Clustering +// TODO: refactor into mllib.perf.clustering like the other clustering tests +class KMeansTest(sc: SparkContext) extends ClusteringTests(sc) { + override def runTest(rdd: RDD[Vector]): KMeansModel = { + val numIterations: Int = intOptionValue(NUM_ITERATIONS) + val k: Int = intOptionValue(NUM_CENTERS) + KMeans.train(rdd, k, numIterations) + } +} + +// Decision-tree +sealed trait TreeBasedModel +case class MLlibRFModel(model: RandomForestModel) extends TreeBasedModel +case class MLlibGBTModel(model: GradientBoostedTreesModel) extends TreeBasedModel +case class MLRFRegressionModel(model: RandomForestRegressionModel) extends TreeBasedModel +case class MLRFClassificationModel(model: RandomForestClassificationModel) extends TreeBasedModel +case class MLGBTRegressionModel(model: GBTRegressionModel) extends TreeBasedModel +case class MLGBTClassificationModel(model: GBTClassificationModel) extends TreeBasedModel + +/** + * Parent class for DecisionTree-based tests which run on a large dataset. 
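+ *
+ * Both the spark.mllib entry points and the new spark.ml estimators are
+ * exercised; every trained model is wrapped in the sealed TreeBasedModel
+ * hierarchy above so that validation can match on a single type. A minimal
+ * sketch of that idea (illustrative only, not part of this patch):
+ * {{{
+ * def describe(model: TreeBasedModel): String = model match {
+ *   case MLlibRFModel(m)  => s"mllib forest of ${m.numTrees} trees"
+ *   case MLlibGBTModel(m) => s"mllib GBT of ${m.numTrees} trees"
+ *   case other            => other.toString
+ * }
+ * }}}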
+ */ +abstract class DecisionTreeTests(sc: SparkContext) + extends RegressionAndClassificationTests[TreeBasedModel](sc) { + + val TEST_DATA_FRACTION = + ("test-data-fraction", "fraction of data to hold out for testing (ignored if given training and test dataset)") + val LABEL_TYPE = + ("label-type", "Type of label: 0 indicates regression, 2+ indicates " + + "classification with this many classes") + val FRAC_CATEGORICAL_FEATURES = ("frac-categorical-features", + "Fraction of features which are categorical") + val FRAC_BINARY_FEATURES = + ("frac-binary-features", "Fraction of categorical features which are binary. " + + "Others have 20 categories.") + val TREE_DEPTH = ("tree-depth", "Depth of true decision tree model used to label examples.") + val MAX_BINS = ("max-bins", "Maximum number of bins for the decision tree learning algorithm.") + val NUM_TREES = ("num-trees", "Number of trees to train. If 1, run DecisionTree. If >1, run an ensemble method (RandomForest).") + val FEATURE_SUBSET_STRATEGY = + ("feature-subset-strategy", "Strategy for feature subset sampling. Supported: auto, all, sqrt, log2, onethird.") + + intOptions = intOptions ++ Seq(LABEL_TYPE, TREE_DEPTH, MAX_BINS, NUM_TREES) + doubleOptions = doubleOptions ++ Seq(TEST_DATA_FRACTION, FRAC_CATEGORICAL_FEATURES, FRAC_BINARY_FEATURES) + stringOptions = stringOptions ++ Seq(FEATURE_SUBSET_STRATEGY) + + addOptionalOptionToParser("training-data", "path to training dataset (if not given, use random data)", "", classOf[String]) + addOptionalOptionToParser("test-data", "path to test dataset (only used if training dataset given)" + + " (if not given, hold out part of training data for validation)", "", classOf[String]) + + var categoricalFeaturesInfo: Map[Int, Int] = Map.empty + + protected var labelType = -1 + + def validate(model: TreeBasedModel, rdd: RDD[LabeledPoint]): Double = { + val numExamples = rdd.count() + val predictions: RDD[(Double, Double)] = model match { + case MLlibRFModel(rfModel) => rfModel.predict(rdd.map(_.features)).zip(rdd.map(_.label)) + case MLlibGBTModel(gbtModel) => gbtModel.predict(rdd.map(_.features)).zip(rdd.map(_.label)) + case MLRFRegressionModel(rfModel) => makePredictions(rfModel, rdd) + case MLRFClassificationModel(rfModel) => makePredictions(rfModel, rdd) + case MLGBTRegressionModel(gbtModel) => makePredictions(gbtModel, rdd) + case MLGBTClassificationModel(gbtModel) => makePredictions(gbtModel, rdd) + } + val labelType: Int = intOptionValue(LABEL_TYPE) + if (labelType == 0) { + calculateRMSE(predictions, numExamples) + } else { + calculateAccuracy(predictions, numExamples) + } + } + + // TODO: generate DataFrame outside of `runTest` so it is not included in timing results + private def makePredictions( + model: PredictionModel[org.apache.spark.ml.linalg.Vector, _], rdd: RDD[LabeledPoint]): RDD[(Double, Double)] = { + val labelType: Int = intOptionValue(LABEL_TYPE) + val dataFrame = DataGenerator.setMetadata(rdd, categoricalFeaturesInfo, labelType) + val results = model.transform(dataFrame) + results + .select(model.getPredictionCol, model.getLabelCol) + .rdd + .map { case Row(prediction: Double, label: Double) => (prediction, label) } + } +} + +class DecisionTreeTest(sc: SparkContext) extends DecisionTreeTests(sc) { + val supportedTreeTypes = Array("RandomForest", "GradientBoostedTrees", + "ml.RandomForest", "ml.GradientBoostedTrees") + + val ENSEMBLE_TYPE = ("ensemble-type", "Type of ensemble algorithm: " + supportedTreeTypes.mkString(" ")) + + stringOptions = stringOptions ++ Seq(ENSEMBLE_TYPE) + 
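+  // Every value option declared above is registered as required by
+  // addOptionsToParser() below, so a run has to supply each of them.
+  // An illustrative invocation with hypothetical values (training-data and
+  // test-data are optional and default to "", i.e. synthetic data):
+  //   decision-tree --num-trials 1 --inter-trial-wait 3 --num-partitions 128
+  //     --random-seed 5 --num-examples 1000000 --num-features 500
+  //     --label-type 2 --frac-categorical-features 0.5 --frac-binary-features 0.5
+  //     --tree-depth 5 --max-bins 32 --num-trees 10 --feature-subset-strategy auto
+  //     --test-data-fraction 0.2 --ensemble-type ml.RandomForest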
+ val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions + addOptionsToParser() + + private def getTestDataFraction: Double = { + val testDataFraction: Double = doubleOptionValue(TEST_DATA_FRACTION) + assert(testDataFraction >= 0 && testDataFraction <= 1, s"Bad testDataFraction: $testDataFraction") + testDataFraction + } + + override def createInputData(seed: Long) = { + val trainingDataPath: String = optionValue[String]("training-data") + val (rdds, categoricalFeaturesInfo_, numClasses) = if (trainingDataPath != "") { + println(s"LOADING FILE: $trainingDataPath") + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + val testDataPath: String = optionValue[String]("test-data") + val testDataFraction: Double = getTestDataFraction + DataLoader.loadLibSVMFiles(sc, numPartitions, trainingDataPath, testDataPath, + testDataFraction, seed) + } else { + createSyntheticInputData(seed) + } + assert(rdds.length == 2) + rdd = rdds(0).cache() + testRdd = rdds(1) + categoricalFeaturesInfo = categoricalFeaturesInfo_ + this.labelType = numClasses + + // Materialize rdd + println("Num Examples: " + rdd.count()) + } + + /** + * Create synthetic training and test datasets. + * @return (trainTestDatasets, categoricalFeaturesInfo, numClasses) where + * trainTestDatasets = Array(trainingData, testData), + * categoricalFeaturesInfo is a map of categorical feature arities, and + * numClasses = number of classes label can take. + */ + private def createSyntheticInputData( + seed: Long): (Array[RDD[LabeledPoint]], Map[Int, Int], Int) = { + // Generic test options + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + // Data dimensions and type + val numExamples: Long = longOptionValue(NUM_EXAMPLES) + val numFeatures: Int = intOptionValue(NUM_FEATURES) + val labelType: Int = intOptionValue(LABEL_TYPE) + val fracCategoricalFeatures: Double = doubleOptionValue(FRAC_CATEGORICAL_FEATURES) + val fracBinaryFeatures: Double = doubleOptionValue(FRAC_BINARY_FEATURES) + // Model specification + val treeDepth: Int = intOptionValue(TREE_DEPTH) + + val (rdd_, categoricalFeaturesInfo_) = + DataGenerator.generateDecisionTreeLabeledPoints(sc, math.ceil(numExamples * 1.25).toLong, + numFeatures, numPartitions, labelType, + fracCategoricalFeatures, fracBinaryFeatures, treeDepth, seed) + + val splits = rdd_.randomSplit(Array(0.8, 0.2), seed) + (splits, categoricalFeaturesInfo_, labelType) + } + + // TODO: generate DataFrame outside of `runTest` so it is not included in timing results + override def runTest(rdd: RDD[LabeledPoint]): TreeBasedModel = { + val treeDepth: Int = intOptionValue(TREE_DEPTH) + val maxBins: Int = intOptionValue(MAX_BINS) + val numTrees: Int = intOptionValue(NUM_TREES) + val featureSubsetStrategy: String = stringOptionValue(FEATURE_SUBSET_STRATEGY) + val ensembleType: String = stringOptionValue(ENSEMBLE_TYPE) + if (!supportedTreeTypes.contains(ensembleType)) { + throw new IllegalArgumentException( + s"DecisionTreeTest given unknown ensembleType param: $ensembleType." 
+ + " Supported values: " + supportedTreeTypes.mkString(" ")) + } + if (labelType == 0) { + // Regression + ensembleType match { + case "RandomForest" => + MLlibRFModel(RandomForest.trainRegressor(rdd, categoricalFeaturesInfo, numTrees, + featureSubsetStrategy, "variance", treeDepth, maxBins, this.getRandomSeed)) + case "ml.RandomForest" => + val labelType: Int = intOptionValue(LABEL_TYPE) + val dataset = DataGenerator.setMetadata(rdd, categoricalFeaturesInfo, labelType) + val model = new RandomForestRegressor() + .setImpurity("variance") + .setMaxDepth(treeDepth) + .setMaxBins(maxBins) + .setNumTrees(numTrees) + .setFeatureSubsetStrategy(featureSubsetStrategy) + .setSeed(this.getRandomSeed) + .fit(dataset) + MLRFRegressionModel(model) + case "GradientBoostedTrees" => + val treeStrategy = new Strategy(Algo.Regression, Variance, treeDepth, + labelType, maxBins, QuantileStrategy.Sort, categoricalFeaturesInfo) + val boostingStrategy = BoostingStrategy(treeStrategy, SquaredError, numTrees, + learningRate = 0.1) + MLlibGBTModel(GradientBoostedTrees.train(rdd, boostingStrategy)) + case "ml.GradientBoostedTrees" => + val labelType: Int = intOptionValue(LABEL_TYPE) + val dataset = DataGenerator.setMetadata(rdd, categoricalFeaturesInfo, labelType) + val model = new GBTRegressor() + .setLossType("squared") + .setMaxBins(maxBins) + .setMaxDepth(treeDepth) + .setMaxIter(numTrees) + .setStepSize(0.1) + .setSeed(this.getRandomSeed) + .fit(dataset) + MLGBTRegressionModel(model) + } + } else if (labelType >= 2) { + // Classification + ensembleType match { + case "RandomForest" => + MLlibRFModel(RandomForest.trainClassifier(rdd, labelType, categoricalFeaturesInfo, numTrees, + featureSubsetStrategy, "gini", treeDepth, maxBins, this.getRandomSeed)) + case "ml.RandomForest" => + val labelType: Int = intOptionValue(LABEL_TYPE) + val dataset = DataGenerator.setMetadata(rdd, categoricalFeaturesInfo, labelType) + val model = new RandomForestClassifier() + .setImpurity("gini") + .setMaxDepth(treeDepth) + .setMaxBins(maxBins) + .setNumTrees(numTrees) + .setFeatureSubsetStrategy(featureSubsetStrategy) + .setSeed(this.getRandomSeed) + .fit(dataset) + MLRFClassificationModel(model) + case "GradientBoostedTrees" => + val treeStrategy = new Strategy(Algo.Classification, Variance, treeDepth, + labelType, maxBins, QuantileStrategy.Sort, categoricalFeaturesInfo) + val boostingStrategy = BoostingStrategy(treeStrategy, LogLoss, numTrees, + learningRate = 0.1) + MLlibGBTModel(GradientBoostedTrees.train(rdd, boostingStrategy)) + case "ml.GradientBoostedTrees" => + val labelType: Int = intOptionValue(LABEL_TYPE) + val dataset = DataGenerator.setMetadata(rdd, categoricalFeaturesInfo, labelType) + val model = new GBTClassifier() + .setLossType("logistic") + .setMaxBins(maxBins) + .setMaxDepth(treeDepth) + .setMaxIter(numTrees) + .setStepSize(0.1) + .setSeed(this.getRandomSeed) + .fit(dataset) + MLGBTClassificationModel(model) + } + } else { + throw new IllegalArgumentException(s"Bad label-type parameter " + + s"given to DecisionTreeTest: $labelType") + } + } +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/PerfTest.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/PerfTest.scala new file mode 100644 index 0000000..bf51482 --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/PerfTest.scala @@ -0,0 +1,134 @@ +package mllib.perf + +import scala.collection.JavaConverters._ + +import joptsimple.{OptionSet, OptionParser} + +import org.json4s._ + +import org.slf4j._ + +abstract class PerfTest { + + val NUM_TRIALS = 
("num-trials", "number of trials to run") + val INTER_TRIAL_WAIT = ("inter-trial-wait", "seconds to sleep between trials") + val NUM_PARTITIONS = ("num-partitions", "number of input partitions") + val RANDOM_SEED = ("random-seed", "seed for random number generator") + + val log = LoggerFactory.getLogger("PerfTest") + def logInfo(msg: String) { + if (log.isInfoEnabled) { + log.info(msg) + } + } + + /** Initialize internal state based on arguments */ + def initialize(testName_ : String, otherArgs: Array[String]) { + testName = testName_ + optionSet = parser.parse(otherArgs:_*) + } + + def getRandomSeed: Int = { + intOptionValue(RANDOM_SEED) + } + + def getNumTrials: Int = { + intOptionValue(NUM_TRIALS) + } + + def getWait: Int = { + intOptionValue(INTER_TRIAL_WAIT) * 1000 + } + + def createInputData(seed: Long) + + /** + * Runs the test and returns a JSON object that captures performance metrics, such as time taken, + * and values of any parameters. + * + * The rendered JSON will look like this (except it will be minified): + * + * { + * "options": { + * "num-partitions": "10", + * "unique-values": "10", + * ... + * }, + * "results": [ + * { + * "trainingTime": 0.211, + * "trainingMetric": 98.1, + * ... + * }, + * ... + * ] + * } + * + * @return metrics from run (e.g. ("time" -> time) + * */ + def run(): JValue + + val parser = new OptionParser() + var optionSet: OptionSet = _ + var testName: String = _ + + var intOptions: Seq[(String, String)] = Seq(NUM_TRIALS, INTER_TRIAL_WAIT, NUM_PARTITIONS, + RANDOM_SEED) + + var doubleOptions: Seq[(String, String)] = Seq() + var longOptions: Seq[(String, String)] = Seq() + + var stringOptions: Seq[(String, String)] = Seq() + var booleanOptions: Seq[(String, String)] = Seq() + + def addOptionsToParser() { + // add all the options to parser + stringOptions.map{case (opt, desc) => + parser.accepts(opt, desc).withRequiredArg().ofType(classOf[String]).required() + } + booleanOptions.map{case (opt, desc) => + parser.accepts(opt, desc) + } + intOptions.map{case (opt, desc) => + parser.accepts(opt, desc).withRequiredArg().ofType(classOf[Int]).required() + } + doubleOptions.map{case (opt, desc) => + parser.accepts(opt, desc).withRequiredArg().ofType(classOf[Double]).required() + } + longOptions.map{case (opt, desc) => + parser.accepts(opt, desc).withRequiredArg().ofType(classOf[Long]).required() + } + } + + def addOptionalOptionToParser[T](opt: String, desc: String, default: T, clazz: Class[T]): Unit = { + parser.accepts(opt, desc).withOptionalArg().ofType(clazz).defaultsTo(default) + } + + def intOptionValue(option: (String, String)) = + optionSet.valueOf(option._1).asInstanceOf[Int] + + def stringOptionValue(option: (String, String)) = + optionSet.valueOf(option._1).asInstanceOf[String] + + def booleanOptionValue(option: (String, String)) = + optionSet.has(option._1) + + def doubleOptionValue(option: (String, String)) = + optionSet.valueOf(option._1).asInstanceOf[Double] + + def longOptionValue(option: (String, String)) = + optionSet.valueOf(option._1).asInstanceOf[Long] + + def optionValue[T](option: String) = + optionSet.valueOf(option).asInstanceOf[T] + + def getOptions: Map[String, String] = { + optionSet.asMap().asScala.flatMap { case (spec, values) => + if (spec.options().size() == 1 && values.size() == 1) { + Some((spec.options().iterator().next(), values.iterator().next().toString)) + } else { + None + } + }.toMap + } +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/StatTests.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/StatTests.scala 
new file mode 100644 index 0000000..21c286c --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/StatTests.scala @@ -0,0 +1,109 @@ +package mllib.perf + +import org.json4s.JsonDSL._ +import org.json4s.JsonAST._ + +import scala.util.Random + +import org.apache.spark.mllib.linalg.{Matrices, Vectors, Matrix, Vector} +import org.apache.spark.SparkContext +import org.apache.spark.mllib.random.RandomRDDs +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.rdd.RDD + +import mllib.perf.util.DataGenerator + + +/** + * Parent class for the tests for the statistics toolbox + */ +abstract class StatTests[T](sc: SparkContext) extends PerfTest { + + def runTest(rdd: T) + + val NUM_ROWS = ("num-rows", "number of rows of the matrix") + val NUM_COLS = ("num-cols", "number of columns of the matrix") + + longOptions = Seq(NUM_ROWS) + intOptions = intOptions ++ Seq(NUM_COLS) + + var rdd: T = _ + + val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions + addOptionsToParser() + + override def run(): JValue = { + val start = System.currentTimeMillis() + runTest(rdd) + val end = System.currentTimeMillis() + val time = (end - start).toDouble / 1000.0 + Map("time" -> time) + } +} + +abstract class CorrelationTests(sc: SparkContext) extends StatTests[RDD[Vector]](sc){ + override def createInputData(seed: Long) = { + val m: Long = longOptionValue(NUM_ROWS) + val n: Int = intOptionValue(NUM_COLS) + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + + rdd = RandomRDDs.normalVectorRDD(sc, m, n, numPartitions, seed).cache() + + // Materialize rdd + println("Num Examples: " + rdd.count()) + } +} + +class PearsonCorrelationTest(sc: SparkContext) extends CorrelationTests(sc) { + override def runTest(data: RDD[Vector]) { + Statistics.corr(data) + } +} + +class SpearmanCorrelationTest(sc: SparkContext) extends CorrelationTests(sc) { + override def runTest(data: RDD[Vector]) { + Statistics.corr(data, "spearman") + } +} + +class ChiSquaredFeatureTest(sc: SparkContext) extends StatTests[RDD[LabeledPoint]](sc) { + override def createInputData(seed: Long) = { + val m: Long = longOptionValue(NUM_ROWS) + val n: Int = intOptionValue(NUM_COLS) + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + + rdd = DataGenerator.generateClassificationLabeledPoints(sc, m, n, 0.5, 1.0, numPartitions, + seed, chiSq = true).cache() + + // Materialize rdd + println("Num Examples: " + rdd.count()) + } + override def runTest(data: RDD[LabeledPoint]) { + Statistics.chiSqTest(data) + } +} + +class ChiSquaredGoFTest(sc: SparkContext) extends StatTests[Vector](sc) { + override def createInputData(seed: Long) = { + val m: Long = longOptionValue(NUM_ROWS) + val rng = new Random(seed) + + rdd = Vectors.dense(Array.fill(m.toInt)(rng.nextDouble())) + } + override def runTest(data: Vector) { + Statistics.chiSqTest(data) + } +} + +class ChiSquaredMatTest(sc: SparkContext) extends StatTests[Matrix](sc) { + override def createInputData(seed: Long) = { + val m: Long = longOptionValue(NUM_ROWS) + val rng = new Random(seed) + + rdd = Matrices.dense(m.toInt, m.toInt, Array.fill(m.toInt * m.toInt)(rng.nextDouble())) + } + override def runTest(data: Matrix) { + Statistics.chiSqTest(data) + } +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/TestRunner.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/TestRunner.scala new file mode 100644 index 0000000..421b62f --- /dev/null +++ 
b/mllib-tests/v2p0/src/main/scala/mllib/perf/TestRunner.scala @@ -0,0 +1,87 @@ +package mllib.perf + +import scala.collection.JavaConverters._ + +import org.json4s.JsonAST._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + +import org.apache.spark.{SparkConf, SparkContext} + +import mllib.perf.clustering.{GaussianMixtureTest, LDATest, PICTest} +import mllib.perf.feature.Word2VecTest +import mllib.perf.fpm.{FPGrowthTest, PrefixSpanTest} +import mllib.perf.linalg.BlockMatrixMultTest + +object TestRunner { + def main(args: Array[String]) { + if (args.length < 1) { + println( + "mllib.perf.TestRunner requires 1 or more args, you gave %s, exiting".format(args.length)) + System.exit(1) + } + val testName = args(0) + val perfTestArgs = args.slice(1, args.length) + val sc = new SparkContext(new SparkConf().setAppName("TestRunner: " + testName)) + + // Unfortunate copy of code because there are Perf Tests in both projects and the compiler doesn't like it + val test: PerfTest = testName match { + case "glm-regression" => new GLMRegressionTest(sc) + case "glm-classification" => new GLMClassificationTest(sc) + case "naive-bayes" => new NaiveBayesTest(sc) + // recommendation + case "als" => new ALSTest(sc) + // clustering + case "gmm" => new GaussianMixtureTest(sc) + case "kmeans" => new KMeansTest(sc) + case "lda" => new LDATest(sc) + case "pic" => new PICTest(sc) + // trees + case "decision-tree" => new DecisionTreeTest(sc) + // linalg + case "svd" => new SVDTest(sc) + case "pca" => new PCATest(sc) + case "block-matrix-mult" => new BlockMatrixMultTest(sc) + // stats + case "summary-statistics" => new ColumnSummaryStatisticsTest(sc) + case "pearson" => new PearsonCorrelationTest(sc) + case "spearman" => new SpearmanCorrelationTest(sc) + case "chi-sq-feature" => new ChiSquaredFeatureTest(sc) + case "chi-sq-gof" => new ChiSquaredGoFTest(sc) + case "chi-sq-mat" => new ChiSquaredMatTest(sc) + // feature + case "word2vec" => new Word2VecTest(sc) + // frequent pattern mining + case "fp-growth" => new FPGrowthTest(sc) + case "prefix-span" => new PrefixSpanTest(sc) + } + test.initialize(testName, perfTestArgs) + // Generate a new dataset for each test + val rand = new java.util.Random(test.getRandomSeed) + + val numTrials = test.getNumTrials + val interTrialWait = test.getWait + + var testOptions: JValue = test.getOptions + val results: Seq[JValue] = (1 to numTrials).map { i => + test.createInputData(rand.nextLong()) + val res: JValue = test.run() + System.gc() + Thread.sleep(interTrialWait) + res + } + // Report the test results as a JSON object describing the test options, Spark + // configuration, Java system properties, as well as the per-test results. + // This extra information helps to ensure reproducibility and makes automatic analysis easier. 
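+    // Shape of the rendered report (illustrative values; printed minified):
+    //   {"testName":"als","options":{"num-trials":"1",...},"sparkConf":{...},
+    //    "sparkVersion":"2.0.0-preview","systemProperties":{...},
+    //    "results":[{"trainingTime":12.3,"testTime":1.7,...}]}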
+ val json: JValue = + ("testName" -> testName) ~ + ("options" -> testOptions) ~ + ("sparkConf" -> sc.getConf.getAll.toMap) ~ + ("sparkVersion" -> sc.version) ~ + ("systemProperties" -> System.getProperties.asScala.toMap) ~ + ("results" -> results) + println("results: " + compact(render(json))) + + sc.stop() + } +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala new file mode 100644 index 0000000..95ce9c6 --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala @@ -0,0 +1,63 @@ +package mllib.perf.clustering + +import java.util.Random + +import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} +import org.json4s.JValue +import org.json4s.JsonDSL._ + +import org.apache.spark.SparkContext +import org.apache.spark.mllib.clustering.GaussianMixture +import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.rdd.RDD + +import mllib.perf.PerfTest + +class GaussianMixtureTest(sc: SparkContext) extends PerfTest { + + val NUM_POINTS = ("num-points", "number of points for clustering tests") + val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") + val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") + val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") + + intOptions ++= Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) + longOptions ++= Seq(NUM_POINTS) + val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions + addOptionsToParser() + + var data: RDD[Vector] = _ + + override def createInputData(seed: Long): Unit = { + val m = longOptionValue(NUM_POINTS) + val n = intOptionValue(NUM_COLUMNS) + val k = intOptionValue(NUM_CENTERS) + val p = intOptionValue(NUM_PARTITIONS) + + val random = new Random(seed ^ 8793480384L) + val mu = Array.fill(k)(new BDV[Double](Array.fill(n)(random.nextGaussian()))) + val f = Array.fill(k)(new BDM[Double](n, n, Array.fill(n * n)(random.nextGaussian()))) + data = sc.parallelize(0L until m, p) + .mapPartitionsWithIndex { (idx, part) => + val rng = new Random(seed & idx) + part.map { _ => + val i = (rng.nextDouble() * k).toInt + val x = new BDV[Double](Array.fill(n)(rng.nextGaussian())) + val y = f(i) * x + mu(i) + Vectors.dense(y.data) + } + }.cache() + logInfo(s"Generated ${data.count()} points.") + } + + override def run(): JValue = { + val numIterations = intOptionValue(NUM_ITERATIONS) + val k = intOptionValue(NUM_CENTERS) + val start = System.currentTimeMillis() + val gmm = new GaussianMixture() + .setK(k) + .setMaxIterations(numIterations) + val model = gmm.run(data) + val duration = (System.currentTimeMillis() - start) / 1e3 + "time" -> duration + } +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/LDATest.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/LDATest.scala new file mode 100644 index 0000000..812f2da --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/LDATest.scala @@ -0,0 +1,73 @@ +package mllib.perf.clustering + +import mllib.perf.PerfTest + +import org.json4s.JValue +import org.json4s.JsonDSL._ + +import scala.collection.mutable.{HashMap => MHashMap} + +import org.apache.commons.math3.random.Well19937c +import org.apache.spark.SparkContext +import org.apache.spark.mllib.clustering.LDA +import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.rdd.RDD + +class 
LDATest(sc: SparkContext) extends PerfTest { + + val NUM_DOCUMENTS = ("num-documents", "number of documents in corpus") + val NUM_VOCABULARY = ("num-vocab", "number of terms in vocabulary") + val NUM_TOPICS = ("num-topics", "number of topics to infer") + val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") + val DOCUMENT_LENGTH = ("document-length", "number of words per document for the algorithm") + val OPTIMIZER = ("optimizer", "optimization algorithm: em or online") + + intOptions ++= Seq(NUM_VOCABULARY, NUM_TOPICS, NUM_ITERATIONS, DOCUMENT_LENGTH) + longOptions ++= Seq(NUM_DOCUMENTS) + stringOptions ++= Seq(OPTIMIZER) + val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions + addOptionsToParser() + + var data: RDD[(Long, Vector)] = _ + + override def createInputData(seed: Long): Unit = { + val numDocs = longOptionValue(NUM_DOCUMENTS) + val numVocab = intOptionValue(NUM_VOCABULARY) + val k = intOptionValue(NUM_TOPICS) + + val numPartitions = intOptionValue(NUM_PARTITIONS) + val docLength = intOptionValue(DOCUMENT_LENGTH) + + data = sc.parallelize(0L until numDocs, numPartitions) + .mapPartitionsWithIndex { (idx, part) => + val rng = new Well19937c(seed ^ idx) + part.map { case docIndex => + var currentSize = 0 + val entries = MHashMap[Int, Int]() + while (currentSize < docLength) { + val index = rng.nextInt(numVocab) + entries(index) = entries.getOrElse(index, 0) + 1 + currentSize += 1 + } + + val iter = entries.toSeq.map(v => (v._1, v._2.toDouble)) + (docIndex, Vectors.sparse(numVocab, iter)) + } + }.cache() + logInfo(s"Number of documents = ${data.count()}.") + } + + override def run(): JValue = { + val k = intOptionValue(NUM_TOPICS) + val numIterations = intOptionValue(NUM_ITERATIONS) + val optimizer = stringOptionValue(OPTIMIZER) + val start = System.currentTimeMillis() + val lda = new LDA() + .setK(k) + .setMaxIterations(numIterations) + .setOptimizer(optimizer) + val model = lda.run(data) + val duration = (System.currentTimeMillis() - start) / 1e3 + "time" -> duration + } +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/PICTest.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/PICTest.scala new file mode 100644 index 0000000..6832ffa --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/PICTest.scala @@ -0,0 +1,53 @@ +package mllib.perf.clustering + +import org.json4s.JValue +import org.json4s.JsonDSL._ + +import org.apache.spark.SparkContext +import org.apache.spark.mllib.clustering.PowerIterationClustering +import org.apache.spark.rdd.RDD + +import mllib.perf.PerfTest + +class PICTest(sc: SparkContext) extends PerfTest { + + val NUM_POINTS = ("num-points", "number of points") + val NODE_DEGREE = ("node-degree", "number of neighbors each node is connected to") + val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") + val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") + + intOptions ++= Seq(NODE_DEGREE, NUM_CENTERS, NUM_ITERATIONS) + longOptions ++= Seq(NUM_POINTS) + val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions + addOptionsToParser() + + var data: RDD[(Long, Long, Double)] = _ + + override def createInputData(seed: Long): Unit = { + val numPoints = longOptionValue(NUM_POINTS) + val nodeDegree = intOptionValue(NODE_DEGREE) + val numPartitions = intOptionValue(NUM_PARTITIONS) + + // Generates a periodic banded matrix with bandwidth = nodeDegree + val data = 
sc.parallelize(0L until numPoints, numPartitions)
+      .flatMap { id =>
+        (((id - nodeDegree / 2) % numPoints) until id).map { nbr =>
+          (id, (nbr + numPoints) % numPoints, 1D)
+        }
+      }
+    // Assign to the field read by run(); the local `val data` above would
+    // otherwise shadow it and leave the field null.
+    this.data = data
+    logInfo(s"Generated ${data.count()} pairwise similarities.")
+  }
+
+  override def run(): JValue = {
+    val numIterations = intOptionValue(NUM_ITERATIONS)
+    val k = intOptionValue(NUM_CENTERS)
+    val start = System.currentTimeMillis()
+    val pic = new PowerIterationClustering()
+      .setK(k)
+      .setMaxIterations(numIterations)
+    val model = pic.run(data)
+    val duration = (System.currentTimeMillis() - start) / 1e3
+    "time" -> duration
+  }
+}
+
diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/feature/Word2VecTest.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/feature/Word2VecTest.scala
new file mode 100644
index 0000000..389d094
--- /dev/null
+++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/feature/Word2VecTest.scala
@@ -0,0 +1,69 @@
+package mllib.perf.feature
+
+import scala.collection.mutable
+
+import org.apache.commons.math3.random.Well19937c
+import org.json4s.JValue
+import org.json4s.JsonDSL._
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.feature.Word2Vec
+import org.apache.spark.rdd.RDD
+
+import mllib.perf.PerfTest
+
+class Word2VecTest(sc: SparkContext) extends PerfTest {
+
+  val NUM_SENTENCES = ("num-sentences", "number of sentences")
+  val NUM_WORDS = ("num-words", "vocabulary size")
+  val VECTOR_SIZE = ("vector-size", "vector size")
+  val NUM_ITERATIONS = ("num-iterations", "number of iterations")
+  val MIN_COUNT = ("min-count", "minimum count for a word to be included")
+
+  intOptions ++= Seq(NUM_SENTENCES, NUM_WORDS, VECTOR_SIZE, NUM_ITERATIONS, MIN_COUNT)
+
+  val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions
+  addOptionsToParser()
+
+  private val avgSentenceLength = 16
+  private var sentences: RDD[Seq[String]] = _
+
+  override def createInputData(seed: Long): Unit = {
+    val numSentences = intOptionValue(NUM_SENTENCES)
+    val numPartitions = intOptionValue(NUM_PARTITIONS)
+    val numWords = intOptionValue(NUM_WORDS)
+    val p = 1.0 / avgSentenceLength
+    sentences = sc.parallelize(0 until numSentences, numPartitions)
+      .mapPartitionsWithIndex { (idx, part) =>
+        val rng = new Well19937c(seed ^ idx)
+        part.map { case i =>
+          var cur = rng.nextInt(numWords)
+          val sentence = mutable.ArrayBuilder.make[Int]
+          while (rng.nextDouble() > p) {
+            cur = (cur + rng.nextGaussian() * 10).toInt % numWords
+            if (cur < 0) {
+              cur += numWords
+            }
+            sentence += cur
+          }
+          sentence.result().map(_.toString).toSeq
+        }
+      }.cache()
+    logInfo(s"Number of sentences = ${sentences.count()}.")
+  }
+
+  override def run(): JValue = {
+    val start = System.currentTimeMillis()
+    val numIterations = intOptionValue(NUM_ITERATIONS)
+    val numPartitions = math.ceil(math.pow(numIterations, 1.5)).toInt
+    val w2v = new Word2Vec()
+      .setNumPartitions(numPartitions)
+      .setNumIterations(numIterations)
+      .setVectorSize(intOptionValue(VECTOR_SIZE))
+      .setMinCount(intOptionValue(MIN_COUNT))
+      .setSeed(0L)
+    val model = w2v.fit(sentences)
+    val duration = (System.currentTimeMillis() - start) / 1e3
+    "time" -> duration
+  }
+}
diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/fpm/FPGrowthTest.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/fpm/FPGrowthTest.scala
new file mode 100644
index 0000000..ddc19d0
--- /dev/null
+++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/fpm/FPGrowthTest.scala
@@ -0,0 +1,65 @@
+package mllib.perf.fpm
+
+import
org.apache.commons.math3.distribution.BinomialDistribution +import org.apache.commons.math3.random.Well19937c +import org.json4s.JValue +import org.json4s.JsonDSL._ + +import org.apache.spark.SparkContext +import org.apache.spark.mllib.fpm.FPGrowth +import org.apache.spark.rdd.RDD + +import mllib.perf.PerfTest + +class FPGrowthTest(sc: SparkContext) extends PerfTest { + + val NUM_BASKETS = ("num-baskets", "number of baskets") + val AVG_BASKET_SIZE = ("avg-basket-size", "average basket size. " + + "The distribution of basket sizes follows binomial distribution with B(10n,1/10).") + val NUM_ITEMS = ("num-items", "number of distinct items") + val MIN_SUPPORT = ("min-support", "minimum support level") + + intOptions = intOptions ++ Seq(NUM_BASKETS, AVG_BASKET_SIZE, NUM_ITEMS) + doubleOptions = doubleOptions ++ Seq(MIN_SUPPORT) + + val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions + addOptionsToParser() + + private var baskets: RDD[Array[Int]] = _ + + override def createInputData(seed: Long): Unit = { + val numPartitions = intOptionValue(NUM_PARTITIONS) + val numBaskets = intOptionValue(NUM_BASKETS) + val numItems = intOptionValue(NUM_ITEMS) + val avgBasketSize = intOptionValue(AVG_BASKET_SIZE) + val maxRatio = 10 + baskets = sc.parallelize(0 until numBaskets, numPartitions) + .mapPartitionsWithIndex { (idx, part) => + val rng = new Well19937c(seed ^ idx) + val binom = new BinomialDistribution(rng, maxRatio * avgBasketSize, 1.0 / maxRatio) + part.map { i => + val basketSize = binom.sample() + // Use math.pow to create a skewed item distribution. + val items = Array.fill(basketSize)((numItems * math.pow(rng.nextDouble(), 0.1)).toInt) + items.toSet[Int].toArray // dedup + }.filter(_.nonEmpty) + }.cache() + val exactNumBaskets = baskets.count() + logInfo(s"Number of baskets: $exactNumBaskets.") + val totalNumItems = baskets.map(_.length.toLong).reduce(_ + _) + logInfo(s"Total number of items: $totalNumItems.") + logInfo(s"Average basket size: ${totalNumItems.toDouble/exactNumBaskets}.") + } + + override def run(): JValue = { + val start = System.currentTimeMillis() + val model = new FPGrowth() + .setMinSupport(doubleOptionValue(MIN_SUPPORT)) + .setNumPartitions(baskets.partitions.length * 8) + .run(baskets) + val numFreqItemsets = model.freqItemsets.count() + val duration = (System.currentTimeMillis() - start) / 1000.0 + logInfo(s"Number of frequent itemsets: $numFreqItemsets.") + "time" -> duration + } +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/fpm/PrefixSpanTest.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/fpm/PrefixSpanTest.scala new file mode 100644 index 0000000..b8fc590 --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/fpm/PrefixSpanTest.scala @@ -0,0 +1,82 @@ +package mllib.perf.fpm + +import org.apache.commons.math3.distribution.BinomialDistribution +import org.apache.commons.math3.random.Well19937c +import org.json4s.JValue +import org.json4s.JsonDSL._ + +import org.apache.spark.SparkContext +import org.apache.spark.mllib.fpm.PrefixSpan +import org.apache.spark.rdd.RDD + +import mllib.perf.PerfTest + +class PrefixSpanTest(sc: SparkContext) extends PerfTest { + + val NUM_SEQUENCES = ("num-sequences", "number of itemset sequences") + val AVG_SEQUENCE_SIZE = ("avg-sequence-size", "average number of itemsets in a sequence. " + + "The distribution of itemset sequence sizes follows binomial distribution with B(10n,1/10).") + val AVG_ITEMSET_SIZE = ("avg-itemset-size", "average number of items in a itemset. 
" + + "The distribution of itemset sizes follows binomial distribution with B(10n,1/10).") + val NUM_ITEMS = ("num-items", "number of distinct items") + val MIN_SUPPORT = ("min-support", "minimum support level") + val MAX_PATTERN_LEN = ("max-pattern-len", "maximum length of frequent itemset sequences") + val MAX_LOCAL_PROJ_DB_SIZE = ("max-local-proj-db-size", "maximum number of items allowed in a " + + "locally processed projected database") + + intOptions ++= Seq(NUM_SEQUENCES, AVG_SEQUENCE_SIZE, AVG_ITEMSET_SIZE, NUM_ITEMS, + MAX_PATTERN_LEN, MAX_LOCAL_PROJ_DB_SIZE) + doubleOptions ++= Seq(MIN_SUPPORT) + longOptions ++= Seq(MAX_LOCAL_PROJ_DB_SIZE) + + + val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions + addOptionsToParser() + + private var sequences: RDD[Array[Array[Int]]] = _ + + override def createInputData(seed: Long): Unit = { + val numPartitions = intOptionValue(NUM_PARTITIONS) + val numSequences = intOptionValue(NUM_SEQUENCES) + val numItems = intOptionValue(NUM_ITEMS) + val avgSequenceSize = intOptionValue(AVG_SEQUENCE_SIZE) + val avgItemsetSize = intOptionValue(AVG_ITEMSET_SIZE) + val maxRatio = 10 + sequences = sc.parallelize(0 until numSequences, numPartitions) + .mapPartitionsWithIndex { (idx, part) => + val rng = new Well19937c(seed ^ idx) + val binomSeq = new BinomialDistribution(rng, maxRatio * avgSequenceSize, 1.0 / maxRatio) + val binomItemset = new BinomialDistribution(rng, maxRatio * avgItemsetSize, 1.0 / maxRatio) + part.map { i => + val seqSize = binomSeq.sample() + // Use math.pow to create a skewed item distribution. + val items = Array.fill(seqSize)( + Array.fill(binomItemset.sample())((numItems * math.pow(rng.nextDouble(), 0.1)).toInt) + ) + items.map(_.toSet[Int].toArray) // dedup + }.filter(_.nonEmpty) + }.cache() + val exactNumSeqs = sequences.count() + logInfo(s"Number of sequences: $exactNumSeqs.") + val totalNumItems = sequences.map(_.flatten.length.toLong).reduce(_ + _) + val totalNumItemsets = sequences.map(_.length.toLong).reduce(_ + _) + logInfo(s"Total number of items: $totalNumItems.") + logInfo(s"Total number of itemsets: $totalNumItemsets.") + logInfo(s"Average num itemsets per sequence: ${totalNumItemsets.toDouble/exactNumSeqs}.") + logInfo(s"Average num items per itemset: ${totalNumItems.toDouble/totalNumItemsets}.") + } + + override def run(): JValue = { + val start = System.currentTimeMillis() + val model = new PrefixSpan() + .setMinSupport(doubleOptionValue(MIN_SUPPORT)) + .setMaxPatternLength(intOptionValue(MAX_PATTERN_LEN)) + .setMaxLocalProjDBSize(longOptionValue(MAX_LOCAL_PROJ_DB_SIZE)) + .run(sequences) + val numFreqItemsets = model.freqSequences.count() + val duration = (System.currentTimeMillis() - start) / 1000.0 + logInfo(s"Number of frequent sequences: $numFreqItemsets.") + "time" -> duration + } +} + diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/linalg/BlockMatrixMultTest.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/linalg/BlockMatrixMultTest.scala new file mode 100644 index 0000000..123368a --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/linalg/BlockMatrixMultTest.scala @@ -0,0 +1,74 @@ +package mllib.perf.linalg + +import java.util.Random + +import org.json4s.JValue +import org.json4s.JsonDSL._ + +import org.apache.spark.SparkContext +import org.apache.spark.mllib.linalg.Matrices +import org.apache.spark.mllib.linalg.distributed.BlockMatrix + +import mllib.perf.PerfTest + +class BlockMatrixMultTest(sc: SparkContext) extends PerfTest { + + val M = ("m", "number 
of rows of A")
+  val K = ("k", "number of columns of A, the same as number of rows of B")
+  val N = ("n", "number of columns of B")
+  val BLOCK_SIZE = ("block-size", "block size")
+
+  intOptions ++= Seq(BLOCK_SIZE)
+  longOptions ++= Seq(M, K, N)
+
+  val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions
+  addOptionsToParser()
+
+  private var A: BlockMatrix = _
+  private var B: BlockMatrix = _
+
+  override def createInputData(seed: Long): Unit = {
+    val m = longOptionValue(M)
+    val k = longOptionValue(K)
+    val n = longOptionValue(N)
+    val blockSize = intOptionValue(BLOCK_SIZE)
+    val numPartitions = intOptionValue(NUM_PARTITIONS)
+
+    val random = new Random(seed)
+
+    A = randn(m, k, blockSize, numPartitions, seed ^ random.nextLong())
+    B = randn(k, n, blockSize, numPartitions, seed ^ random.nextLong())
+  }
+
+  def randn(
+      m: Long,
+      n: Long,
+      blockSize: Int,
+      numPartitions: Int,
+      seed: Long): BlockMatrix = {
+    // Use floating-point division before ceil so a trailing partial block is
+    // still generated when the dimensions are not multiples of blockSize.
+    val numRowBlocks = math.ceil(m * 1.0 / blockSize).toInt
+    val numColBlocks = math.ceil(n * 1.0 / blockSize).toInt
+    val sqrtParts = math.ceil(math.sqrt(numPartitions)).toInt
+    val rowBlockIds = sc.parallelize(0 until numRowBlocks, sqrtParts)
+    val colBlockIds = sc.parallelize(0 until numColBlocks, sqrtParts)
+    val blockIds = rowBlockIds.cartesian(colBlockIds)
+    val blocks = blockIds.mapPartitionsWithIndex { (idx, ids) =>
+      val random = new Random(idx ^ seed)
+      ids.map { case (rowBlockId, colBlockId) =>
+        val mi = math.min(m - rowBlockId * blockSize, blockSize).toInt
+        val ni = math.min(n - colBlockId * blockSize, blockSize).toInt
+        ((rowBlockId, colBlockId), Matrices.randn(mi, ni, random))
+      }
+    }.cache()
+    logInfo(s"Generated ${blocks.count()} blocks.")
+    new BlockMatrix(blocks, blockSize, blockSize, m, n)
+  }
+
+  override def run(): JValue = {
+    val start = System.currentTimeMillis()
+    val C = A.multiply(B)
+    C.blocks.count()
+    val duration = (System.currentTimeMillis() - start) / 1e3
+    "time" -> duration
+  }
+}
diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataGenerator.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataGenerator.scala
new file mode 100644
index 0000000..e65a5a5
--- /dev/null
+++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataGenerator.scala
@@ -0,0 +1,586 @@
+package mllib.perf.util
+
+import org.apache.spark.ml.attribute.{AttributeGroup, NumericAttribute, NominalAttribute}
+import org.apache.spark.sql.{SQLContext, DataFrame}
+
+import scala.collection.mutable
+
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+import org.apache.spark.mllib.linalg.distributed.RowMatrix
+import org.apache.spark.mllib.random._
+import org.apache.spark.mllib.recommendation.Rating
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.tree.configuration.{Algo, FeatureType}
+import org.apache.spark.mllib.tree.model.{Split, DecisionTreeModel, Node, Predict}
+import org.apache.spark.rdd.{PairRDDFunctions, RDD}
+import org.apache.spark.SparkContext
+
+object DataGenerator {
+
+  def generateLabeledPoints(
+      sc: SparkContext,
+      numRows: Long,
+      numCols: Int,
+      intercept: Double,
+      labelNoise: Double,
+      numPartitions: Int,
+      seed: Long = System.currentTimeMillis(),
+      problem: String = ""): RDD[LabeledPoint] = {
+
+    RandomRDDs.randomRDD(sc, new LinearDataGenerator(numCols, intercept, seed, labelNoise, problem),
+      numRows, numPartitions, seed)
+
+  }
+
+  def generateDistributedSquareMatrix(
+      sc: SparkContext,
+      m: Long,
+      n: Int,
+      numPartitions: Int,
+      seed: Long = System.currentTimeMillis()):
RowMatrix = { + + val data: RDD[Vector] = RandomRDDs.normalVectorRDD(sc, m, n, numPartitions, seed) + + new RowMatrix(data,m,n) + } + + def generateClassificationLabeledPoints( + sc: SparkContext, + numRows: Long, + numCols: Int, + threshold: Double, + featureNoise: Double, + numPartitions: Int, + seed: Long = System.currentTimeMillis(), + chiSq: Boolean = false): RDD[LabeledPoint] = { + + RandomRDDs.randomRDD(sc, new ClassLabelGenerator(numCols,threshold, featureNoise, chiSq), + numRows, numPartitions, seed) + } + + def generateBinaryLabeledPoints( + sc: SparkContext, + numRows: Long, + numCols: Int, + threshold: Double, + numPartitions: Int, + seed: Long = System.currentTimeMillis()): RDD[LabeledPoint] = { + + RandomRDDs.randomRDD(sc, new BinaryLabeledDataGenerator(numCols,threshold), + numRows, numPartitions, seed) + } + + /** + * @param labelType 0 = regression with labels in [0,1]. Values >= 2 indicate classification. + * @param fracCategorical Fraction of columns/features to be categorical. + * @param fracBinary Fraction of categorical features to be binary. Others are high-arity (20). + * @param treeDepth Depth of "true" tree used to label points. + * @return (data, categoricalFeaturesInfo) + * data is an RDD of data points. + * categoricalFeaturesInfo is a map storing the arity of categorical features. + * E.g., an entry (n -> k) indicates that feature n is categorical + * with k categories indexed from 0: {0, 1, ..., k-1}. + */ + def generateDecisionTreeLabeledPoints( + sc: SparkContext, + numRows: Long, + numCols: Int, + numPartitions: Int, + labelType: Int, + fracCategorical: Double, + fracBinary: Double, + treeDepth: Int, + seed: Long = System.currentTimeMillis()): (RDD[LabeledPoint], Map[Int, Int]) = { + + val highArity = 20 + + require(fracCategorical >= 0 && fracCategorical <= 1, + s"fracCategorical must be in [0,1], but it is $fracCategorical") + require(fracBinary >= 0 && fracBinary <= 1, + s"fracBinary must be in [0,1], but it is $fracBinary") + + val isRegression = labelType == 0 + if (!isRegression) { + require(labelType >= 2, s"labelType must be >= 2 for classification. 0 indicates regression.") + } + val numCategorical = (numCols * fracCategorical).toInt + val numContinuous = numCols - numCategorical + val numBinary = (numCategorical * fracBinary).toInt + val numHighArity = numCategorical - numBinary + val categoricalArities = Array.concat(Array.fill(numBinary)(2), + Array.fill(numHighArity)(highArity)) + + val featuresGenerator = new FeaturesGenerator(categoricalArities, numContinuous) + val featureMatrix = RandomRDDs.randomRDD(sc, featuresGenerator, + numRows, numPartitions, seed) + + // Create random DecisionTree. + val featureArity = Array.concat(categoricalArities, Array.fill(numContinuous)(0)) + val trueModel = randomBalancedDecisionTree(treeDepth, labelType, featureArity, seed) + println(trueModel) + + // Label points using tree. + val labelVector = featureMatrix.map(trueModel.predict) + + val data = labelVector.zip(featureMatrix).map(pair => new LabeledPoint(pair._1, pair._2)) + val categoricalFeaturesInfo = featuresGenerator.getCategoricalFeaturesInfo + (data, categoricalFeaturesInfo) + } + + /** + * From spark.ml.impl.TreeTests + * + * Convert the given data to a DataFrame, and set the features and label metadata. + * @param data Dataset. Categorical features and labels must already have 0-based indices. + * This must be non-empty. 
+ * @param categoricalFeatures Map: categorical feature index -> number of distinct values + * @param numClasses Number of classes label can take. If 0, mark as continuous. + * @return DataFrame with metadata + */ + def setMetadata( + data: RDD[LabeledPoint], + categoricalFeatures: Map[Int, Int], + numClasses: Int): DataFrame = { + val sqlContext = SQLContext.getOrCreate(data.sparkContext) + import sqlContext.implicits._ + val df = data.toDF() + val numFeatures = data.first().features.size + val featuresAttributes = Range(0, numFeatures).map { feature => + if (categoricalFeatures.contains(feature)) { + NominalAttribute.defaultAttr.withIndex(feature).withNumValues(categoricalFeatures(feature)) + } else { + NumericAttribute.defaultAttr.withIndex(feature) + } + }.toArray + val featuresMetadata = new AttributeGroup("features", featuresAttributes).toMetadata() + val labelAttribute = if (numClasses == 0) { + NumericAttribute.defaultAttr.withName("label") + } else { + NominalAttribute.defaultAttr.withName("label").withNumValues(numClasses) + } + val labelMetadata = labelAttribute.toMetadata() + df.select(df("features").as("features", featuresMetadata), + df("label").as("label", labelMetadata)) + } + + + def randomBalancedDecisionTree( + depth: Int, + labelType: Int, + featureArity: Array[Int], + seed: Long = System.currentTimeMillis()): DecisionTreeModel = { + + require(depth >= 0, s"randomBalancedDecisionTree given depth < 0.") + require(depth <= featureArity.size, + s"randomBalancedDecisionTree requires depth <= featureArity.size," + + s" but depth = $depth and featureArity.size = ${featureArity.size}") + val isRegression = labelType == 0 + if (!isRegression) { + require(labelType >= 2, s"labelType must be >= 2 for classification. 0 indicates regression.") + } + + val rng = new scala.util.Random() + rng.setSeed(seed) + + val labelGenerator = if (isRegression) { + new RealLabelPairGenerator() + } else { + new ClassLabelPairGenerator(labelType) + } + + val topNode = randomBalancedDecisionTreeHelper(0, depth, featureArity, labelGenerator, + Set.empty, rng) + if (isRegression) { + new DecisionTreeModel(topNode, Algo.Regression) + } else { + new DecisionTreeModel(topNode, Algo.Classification) + } + } + + /** + * Create an internal node. Either create the leaf nodes beneath it, or recurse as needed. + * @param nodeIndex Index of node. + * @param subtreeDepth Depth of subtree to build. Depth 0 means this is a leaf node. + * @param featureArity Indicates feature type. Value 0 indicates continuous feature. + * Other values >= 2 indicate a categorical feature, + * where the value is the number of categories. + * @param usedFeatures Features appearing in the path from the tree root to the node + * being constructed. + * @param labelGenerator Generates pairs of distinct labels. + * @return + */ + def randomBalancedDecisionTreeHelper( + nodeIndex: Int, + subtreeDepth: Int, + featureArity: Array[Int], + labelGenerator: RandomDataGenerator[Pair[Double, Double]], + usedFeatures: Set[Int], + rng: scala.util.Random): Node = { + + if (subtreeDepth == 0) { + // This case only happens for a depth 0 tree. + return new Node(id = nodeIndex, predict = new Predict(0), impurity = 0, isLeaf = true, + split = None, leftNode = None, rightNode = None, stats = None) + } + + val numFeatures = featureArity.size + if (usedFeatures.size >= numFeatures) { + // Should not happen. + throw new RuntimeException(s"randomBalancedDecisionTreeSplitNode ran out of " + + s"features for splits.") + } + + // Make node internal. 
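+    // Draw feature indices until we find one unused on the path from the root; the
+    // require(depth <= featureArity.size) check in randomBalancedDecisionTree
+    // guarantees an unused feature remains, so this loop terminates.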
+ var feature: Int = rng.nextInt(numFeatures) + while (usedFeatures.contains(feature)) { + feature = rng.nextInt(numFeatures) + } + val split: Split = if (featureArity(feature) == 0) { + // continuous feature + new Split(feature = feature, threshold = rng.nextDouble(), + featureType = FeatureType.Continuous, categories = List()) + } else { + // categorical feature + // Put nCatsSplit categories on left, and the rest on the right. + // nCatsSplit is in {1,...,arity-1}. + val nCatsSplit = rng.nextInt(featureArity(feature) - 1) + 1 + val splitCategories = rng.shuffle(Range(0,featureArity(feature)).toList).take(nCatsSplit) + new Split(feature = feature, threshold = 0, + featureType = FeatureType.Categorical, categories = + splitCategories.asInstanceOf[List[Double]]) + } + + val leftChildIndex = nodeIndex * 2 + 1 + val rightChildIndex = nodeIndex * 2 + 2 + if (subtreeDepth == 1) { + // Add leaf nodes. + val predictions = labelGenerator.nextValue() + new Node(id = nodeIndex, predict = new Predict(0), impurity = 0, isLeaf = false, split = Some(split), + leftNode = Some(new Node(id = leftChildIndex, predict = new Predict(predictions._1), impurity = 0, isLeaf = true, + split = None, leftNode = None, rightNode = None, stats = None)), + rightNode = Some(new Node(id = rightChildIndex, predict = new Predict(predictions._2), impurity = 0, isLeaf = true, + split = None, leftNode = None, rightNode = None, stats = None)), stats = None) + } else { + new Node(id = nodeIndex, predict = new Predict(0), impurity = 0, isLeaf = false, split = Some(split), + leftNode = Some(randomBalancedDecisionTreeHelper(leftChildIndex, subtreeDepth - 1, + featureArity, labelGenerator, usedFeatures + feature, rng)), + rightNode = Some(randomBalancedDecisionTreeHelper(rightChildIndex, subtreeDepth - 1, + featureArity, labelGenerator, usedFeatures + feature, rng)), stats = None) + } + } + + def generateKMeansVectors( + sc: SparkContext, + numRows: Long, + numCols: Int, + numCenters: Int, + numPartitions: Int, + seed: Long = System.currentTimeMillis()): RDD[Vector] = { + + RandomRDDs.randomRDD(sc, new KMeansDataGenerator(numCenters, numCols, seed), + numRows, numPartitions, seed) + } + + + // Problems with having a userID or productID in the test set but not training set + // leads to a lot of work... 
+ def generateRatings( + sc: SparkContext, + numUsers: Int, + numProducts: Int, + numRatings: Long, + implicitPrefs: Boolean, + numPartitions: Int, + seed: Long = System.currentTimeMillis()): (RDD[Rating],RDD[Rating]) = { + + val train = RandomRDDs.randomRDD(sc, + new RatingGenerator(numUsers, numProducts,implicitPrefs), + numRatings, numPartitions, seed).cache() + + val test = RandomRDDs.randomRDD(sc, + new RatingGenerator(numUsers, numProducts,implicitPrefs), + math.ceil(numRatings * 0.25).toLong, numPartitions, seed + 24) + + // Now get rid of duplicate ratings and remove non-existant userID's + // and prodID's from the test set + val commons: PairRDDFunctions[(Int,Int),Rating] = + new PairRDDFunctions(train.keyBy(rating => (rating.user, rating.product)).cache()) + + val exact = commons.join(test.keyBy(rating => (rating.user, rating.product))) + + val trainPruned = commons.subtractByKey(exact).map(_._2).cache() + + // Now get rid of users that don't exist in the train set + val trainUsers: RDD[(Int,Rating)] = trainPruned.keyBy(rating => rating.user) + val testUsers: PairRDDFunctions[Int,Rating] = + new PairRDDFunctions(test.keyBy(rating => rating.user)) + val testWithAdditionalUsers = testUsers.subtractByKey(trainUsers) + + val userPrunedTestProds: RDD[(Int,Rating)] = + testUsers.subtractByKey(testWithAdditionalUsers).map(_._2).keyBy(rating => rating.product) + + val trainProds: RDD[(Int,Rating)] = trainPruned.keyBy(rating => rating.product) + + val testWithAdditionalProds = + new PairRDDFunctions[Int, Rating](userPrunedTestProds).subtractByKey(trainProds) + val finalTest = + new PairRDDFunctions[Int, Rating](userPrunedTestProds).subtractByKey(testWithAdditionalProds) + .map(_._2) + + (trainPruned, finalTest) + } + +} + +class RatingGenerator( + private val numUsers: Int, + private val numProducts: Int, + private val implicitPrefs: Boolean) extends RandomDataGenerator[Rating] { + + private val rng = new java.util.Random() + + private val observed = new mutable.HashMap[(Int, Int), Boolean]() + + override def nextValue(): Rating = { + var tuple = (rng.nextInt(numUsers),rng.nextInt(numProducts)) + while (observed.getOrElse(tuple,false)){ + tuple = (rng.nextInt(numUsers),rng.nextInt(numProducts)) + } + observed += (tuple -> true) + + val rating = if (implicitPrefs) rng.nextInt(2)*1.0 else rng.nextDouble()*5 + + new Rating(tuple._1, tuple._2, rating) + } + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): RatingGenerator = new RatingGenerator(numUsers, numProducts, implicitPrefs) +} + +// For general classification +class ClassLabelGenerator( + private val numFeatures: Int, + private val threshold: Double, + private val featureNoise: Double, + private val chiSq: Boolean) extends RandomDataGenerator[LabeledPoint] { + + private val rng = new java.util.Random() + + override def nextValue(): LabeledPoint = { + val y = if (rng.nextDouble() < threshold) 0.0 else 1.0 + val x = Array.fill[Double](numFeatures) { + if (!chiSq) rng.nextGaussian() + (y * featureNoise) else rng.nextInt(6) * 1.0 + } + + LabeledPoint(y, Vectors.dense(x)) + } + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): ClassLabelGenerator = + new ClassLabelGenerator(numFeatures, threshold, featureNoise, chiSq) +} + +class BinaryLabeledDataGenerator( + private val numFeatures: Int, + private val threshold: Double) extends RandomDataGenerator[LabeledPoint] { + + private val rng = new java.util.Random() + + override def nextValue(): LabeledPoint = { + val y = if 
(rng.nextDouble() < threshold) 0.0 else 1.0 + val x = Array.fill[Double](numFeatures) { + if (rng.nextDouble() < threshold) 0.0 else 1.0 + } + LabeledPoint(y, Vectors.dense(x)) + } + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): BinaryLabeledDataGenerator = + new BinaryLabeledDataGenerator(numFeatures, threshold) + +} + +class LinearDataGenerator( + val numFeatures: Int, + val intercept: Double, + val seed: Long, + val labelNoise: Double, + val problem: String = "", + val sparsity: Double = 1.0) extends RandomDataGenerator[LabeledPoint] { + + private val rng = new java.util.Random(seed) + + private val weights = Array.fill(numFeatures)(rng.nextDouble()) + private val nnz: Int = math.ceil(numFeatures*sparsity).toInt + + override def nextValue(): LabeledPoint = { + val x = Array.fill[Double](nnz)(2*rng.nextDouble()-1) + + val y = weights.zip(x).map(p => p._1 * p._2).sum + intercept + labelNoise*rng.nextGaussian() + val yD = + if (problem == "SVM"){ + if (y < 0.0) 0.0 else 1.0 + } else{ + y + } + + LabeledPoint(yD, Vectors.dense(x)) + } + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): LinearDataGenerator = + new LinearDataGenerator(numFeatures, intercept, seed, labelNoise, problem, sparsity) +} + + +/** + * Generator for a pair of distinct class labels from the set {0,...,numClasses-1}. + * @param numClasses Number of classes. + */ +class ClassLabelPairGenerator(val numClasses: Int) + extends RandomDataGenerator[Pair[Double, Double]] { + + require(numClasses >= 2, + s"ClassLabelPairGenerator given label numClasses = $numClasses, but numClasses should be >= 2.") + + private val rng = new java.util.Random() + + override def nextValue(): Pair[Double, Double] = { + val left = rng.nextInt(numClasses) + var right = rng.nextInt(numClasses) + while (right == left) { + right = rng.nextInt(numClasses) + } + new Pair[Double, Double](left, right) + } + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): ClassLabelPairGenerator = new ClassLabelPairGenerator(numClasses) +} + + +/** + * Generator for a pair of real-valued labels. + */ +class RealLabelPairGenerator() extends RandomDataGenerator[Pair[Double, Double]] { + + private val rng = new java.util.Random() + + override def nextValue(): Pair[Double, Double] = + new Pair[Double, Double](rng.nextDouble(), rng.nextDouble()) + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): RealLabelPairGenerator = new RealLabelPairGenerator() +} + + +/** + * Generator for a feature vector which can include a mix of categorical and continuous features. + * @param categoricalArities Specifies the number of categories for each categorical feature. + * @param numContinuous Number of continuous features. Feature values are in range [0,1]. + */ +class FeaturesGenerator(val categoricalArities: Array[Int], val numContinuous: Int) + extends RandomDataGenerator[Vector] { + + categoricalArities.foreach { arity => + require(arity >= 2, s"FeaturesGenerator given categorical arity = $arity, " + + s"but arity should be >= 2.") + } + + val numFeatures = categoricalArities.size + numContinuous + + private val rng = new java.util.Random() + + /** + * Generates vector with categorical features first, and continuous features in [0,1] second. + */ + override def nextValue(): Vector = { + // Feature ordering matches getCategoricalFeaturesInfo. 
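+    // Illustrative example (not a real draw): with categoricalArities = Array(2, 20)
+    // and numContinuous = 1, a generated vector might be [1.0, 17.0, 0.42]:
+    // category indices first, then uniform draws from [0, 1].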
+ val arr = new Array[Double](numFeatures) + var j = 0 + while (j < categoricalArities.size) { + arr(j) = rng.nextInt(categoricalArities(j)) + j += 1 + } + while (j < numFeatures) { + arr(j) = rng.nextDouble() + j += 1 + } + Vectors.dense(arr) + } + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): FeaturesGenerator = new FeaturesGenerator(categoricalArities, numContinuous) + + /** + * @return categoricalFeaturesInfo Map storing arity of categorical features. + * E.g., an entry (n -> k) indicates that feature n is categorical + * with k categories indexed from 0: {0, 1, ..., k-1}. + */ + def getCategoricalFeaturesInfo: Map[Int, Int] = { + // Categorical features are indexed from 0 because of the implementation of nextValue(). + categoricalArities.zipWithIndex.map(_.swap).toMap + } + +} + + +class KMeansDataGenerator( + val numCenters: Int, + val numColumns: Int, + val seed: Long) extends RandomDataGenerator[Vector] { + + private val rng = new java.util.Random(seed) + private val rng2 = new java.util.Random(seed + 24) + private val scale_factors = Array.fill(numCenters)(rng.nextInt(20) - 10) + + // Have a random number of points around a cluster + private val concentrations: Seq[Double] = { + val rand = Array.fill(numCenters)(rng.nextDouble()) + val randSum = rand.sum + val scaled = rand.map(x => x / randSum) + + (1 to numCenters).map{i => + scaled.slice(0, i).sum + } + } + + private val centers = (0 until numCenters).map{i => + Array.fill(numColumns)((2 * rng.nextDouble() - 1)*scale_factors(i)) + } + + override def nextValue(): Vector = { + val pick_center_rand = rng2.nextDouble() + + val centerToAddTo = centers(concentrations.indexWhere(p => pick_center_rand <= p)) + + Vectors.dense(Array.tabulate(numColumns)(i => centerToAddTo(i) + rng2.nextGaussian())) + } + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numColumns, seed) +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataLoader.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataLoader.scala new file mode 100644 index 0000000..f0bd48c --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataLoader.scala @@ -0,0 +1,143 @@ +package mllib.perf.util + +import org.apache.spark.SparkContext +import org.apache.spark.mllib.regression._ +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.rdd.RDD + +object DataLoader { + + // For DecisionTreeTest: PartitionLabelStats tracks the stats for each partition. 
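+  // Each partition is summarized as (min, max, distinct-label count, whether any
+  // non-integer label was seen); labelCombOp below merges the per-partition
+  // summaries, and isClassification treats labels as class indices only when all
+  // labels are integers and the distinct count stays within MAX_CATEGORIES.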
+  class PartitionLabelStats(
+      var min: Double,
+      var max: Double,
+      var distinct: Long,
+      var nonInteger: Boolean)
+    extends Serializable
+
+  object PartitionLabelStats extends Serializable {
+    /** Max categories allowed for categorical label (for inferring labelType) */
+    val MAX_CATEGORIES = 1000
+
+    def labelSeqOp(lps: Iterator[LabeledPoint]): Iterator[PartitionLabelStats] = {
+      val stats = new PartitionLabelStats(Double.MaxValue, Double.MinValue, 0, false)
+      val labelSet = new scala.collection.mutable.HashSet[Double]()
+      lps.foreach { lp =>
+        if (lp.label.toInt != lp.label) {
+          stats.nonInteger = true
+        }
+        stats.min = Math.min(lp.label, stats.min)
+        stats.max = Math.max(lp.label, stats.max)
+        if (labelSet.size <= MAX_CATEGORIES) {
+          labelSet.add(lp.label)
+        }
+      }
+      stats.distinct = labelSet.size
+      Iterator(stats)
+    }
+
+    def labelCombOp(
+        labelStatsA: PartitionLabelStats,
+        labelStatsB: PartitionLabelStats): PartitionLabelStats = {
+      labelStatsA.min = Math.min(labelStatsA.min, labelStatsB.min)
+      labelStatsA.max = Math.max(labelStatsA.max, labelStatsB.max)
+      labelStatsA.distinct = Math.max(labelStatsA.distinct, labelStatsB.distinct)
+      labelStatsA
+    }
+  }
+
+  /** Infer label type from data */
+  private def isClassification(data: RDD[LabeledPoint]): Boolean = {
+    val labelStats =
+      data.mapPartitions(PartitionLabelStats.labelSeqOp)
+        .fold(new PartitionLabelStats(Double.MaxValue, Double.MinValue, 0, false))(
+          PartitionLabelStats.labelCombOp)
+    labelStats.distinct <= PartitionLabelStats.MAX_CATEGORIES && !labelStats.nonInteger
+  }
+
+  /**
+   * Load training and test LibSVM-format data files.
+   * @return (trainTestDatasets, categoricalFeaturesInfo, numClasses) where
+   *         trainTestDatasets = Array(trainingData, testData),
+   *         categoricalFeaturesInfo is a map of categorical feature arities, and
+   *         numClasses = number of classes label can take.
+   */
+  private[perf] def loadLibSVMFiles(
+      sc: SparkContext,
+      numPartitions: Int,
+      trainingDataPath: String,
+      testDataPath: String,
+      testDataFraction: Double,
+      seed: Long): (Array[RDD[LabeledPoint]], Map[Int, Int], Int) = {
+
+    val trainingData = MLUtils.loadLibSVMFile(sc, trainingDataPath, -1, numPartitions)
+
+    val (rdds, categoricalFeaturesInfo_) = if (testDataPath == "") {
+      // randomly split trainingData into train, test
+      val splits = trainingData.randomSplit(Array(1.0 - testDataFraction, testDataFraction), seed)
+      (splits, Map.empty[Int, Int])
+    } else {
+      // load test data
+      val numFeatures = trainingData.take(1)(0).features.size
+      val testData = MLUtils.loadLibSVMFile(sc, testDataPath, numFeatures, numPartitions)
+      (Array(trainingData, testData), Map.empty[Int, Int])
+    }
+
+    // For classification, re-index classes if needed.
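+    // (MLlib classifiers expect labels 0,...,numClasses-1, so unless the loaded
+    // labels are exactly {0.0, 1.0} already, each label is remapped to its rank
+    // among the sorted distinct labels.)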
+ val (finalDatasets, classIndexMap, numClasses) = { + if (isClassification(rdds(0)) && isClassification(rdds(1))) { + // classCounts: class --> # examples in class + val classCounts: Map[Double, Long] = { + val trainClassCounts = rdds(0).map(_.label).countByValue() + val testClassCounts = rdds(1).map(_.label).countByValue() + val mutableClassCounts = new scala.collection.mutable.HashMap[Double, Long]() + trainClassCounts.foreach { case (label, cnt) => + mutableClassCounts(label) = mutableClassCounts.getOrElseUpdate(label, 0) + cnt + } + testClassCounts.foreach { case (label, cnt) => + mutableClassCounts(label) = mutableClassCounts.getOrElseUpdate(label, 0) + cnt + } + mutableClassCounts.toMap + } + val sortedClasses = classCounts.keys.toList.sorted + val numClasses = classCounts.size + // classIndexMap: class --> index in 0,...,numClasses-1 + val classIndexMap = { + if (classCounts.keySet != Set(0.0, 1.0)) { + sortedClasses.zipWithIndex.toMap + } else { + Map[Double, Int]() + } + } + val indexedRdds = { + if (classIndexMap.isEmpty) { + rdds + } else { + rdds.map { rdd => + rdd.map(lp => LabeledPoint(classIndexMap(lp.label), lp.features)) + } + } + } + val numTrain = indexedRdds(0).count() + val numTest = indexedRdds(1).count() + val numTotalInstances = numTrain + numTest + println(s"numTrain: $numTrain") + println(s"numTest: $numTest") + println(s"numClasses: $numClasses") + println(s"Per-class example fractions, counts:") + println(s"Class\tFrac\tCount") + sortedClasses.foreach { c => + val frac = classCounts(c) / numTotalInstances.toDouble + println(s"$c\t$frac\t${classCounts(c)}") + } + (indexedRdds, classIndexMap, numClasses) + } else { + (rdds, null, 0) + } + } + + (finalDatasets, categoricalFeaturesInfo_, numClasses) + } + +} From b1d19475996ba279c24e6c11e5d8e166787d6df4 Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Thu, 30 Jun 2016 16:58:47 +0100 Subject: [PATCH 05/22] Add config file template back in --- config/config.py.template | 797 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 797 insertions(+) create mode 100755 config/config.py.template diff --git a/config/config.py.template b/config/config.py.template new file mode 100755 index 0000000..019cbac --- /dev/null +++ b/config/config.py.template @@ -0,0 +1,797 @@ +""" +Configuration options for running Spark performance tests. + +When updating `spark-perf`, you should probably use `diff` to compare the updated template to +your modified `config.py` file and copy over any new configurations. +""" + +import time +import os +import os.path +import socket + +from sparkperf.config_utils import FlagSet, JavaOptionSet, OptionSet, ConstantOption + + +# ================================ # +# Standard Configuration Options # +# ================================ # + +DEFAULT_HOME=os.environ['HOME'] + +SPARK_HOME_DIR = os.getenv('SPARK_HOME', DEFAULT_HOME) + +# Use a custom configuration directory +SPARK_CONF_DIR = SPARK_HOME_DIR + "/conf" + +# Master used when submitting Spark jobs. +# For local clusters: "spark://%s:7077" % socket.gethostname() +# For Yarn clusters: "yarn" +# Otherwise, the default uses the specified EC2 cluster + +SPARK_CLUSTER_URL = "spark://%s:7077" % socket.gethostname() +IS_YARN_MODE = "yarn" in SPARK_CLUSTER_URL +IS_MESOS_MODE = "mesos" in SPARK_CLUSTER_URL + +# Specify URI to download spark executor. This only applied for running with Mesos. +#SPARK_EXECUTOR_URI = "http://localhost:8000/spark.tgz" + +# Path to the Mesos native library. This is only required for running with Mesos. 
+#MESOS_NATIVE_LIBRARY = "/usr/local/lib/libmesos.so" + +# Run Mesos client in coarse or fine grain mode. This is only applied for running with Mesos. +#SPARK_MESOS_COARSE = True + + +# If this is true, we'll submit your job using an existing Spark installation. +# If this is false, we'll clone and build a specific version of Spark, and +# copy configurations from your existing Spark installation. +USE_CLUSTER_SPARK = True + +# URL of the HDFS installation in the Spark EC2 cluster +HDFS_URL = "hdfs://%s:9000/test/" % socket.gethostname() + +# Set the following if not using existing Spark installation +# Commit id and repo used if you are not using an existing Spark cluster +# custom version of Spark. The remote name in your git repo is assumed +# to be "origin". +# +# The commit ID can specify any of the following: +# 1. A git commit hash e.g. "4af93ff3" +# 2. A branch name e.g. "origin/branch-0.7" +# 3. A tag name e.g. "origin/tag/v0.8.0-incubating" +# 4. A pull request e.g. "origin/pr/675" +SPARK_COMMIT_ID = "" +SPARK_GIT_REPO = "https://github.com/apache/spark.git" +SPARK_MERGE_COMMIT_INTO_MASTER = False # Whether to merge the commit into master + +# Whether to install and build Spark. Set this to true only for the +# first installation if an existing one does not already exist. +PREP_SPARK = not USE_CLUSTER_SPARK + +# Whether to restart the Master and all Workers +# This should always be false for Yarn +RESTART_SPARK_CLUSTER = True +RESTART_SPARK_CLUSTER = RESTART_SPARK_CLUSTER and not IS_YARN_MODE + +# Rsync SPARK_HOME to all the slaves or not +RSYNC_SPARK_HOME = True + +# Which tests to run +RUN_SPARK_TESTS = False +RUN_PYSPARK_TESTS = False +RUN_STREAMING_TESTS = False +RUN_MLLIB_TESTS = True +RUN_PYTHON_MLLIB_TESTS = True + +# Which tests to prepare. Set this to true for the first +# installation or whenever you make a change to the tests. +PREP_SPARK_TESTS = True +PREP_PYSPARK_TESTS = True +PREP_STREAMING_TESTS = True +PREP_MLLIB_TESTS = True + +# Whether to warm up local disks (warm-up is only necesary on EC2). +DISK_WARMUP = False + +# Total number of bytes used to warm up each local directory. +DISK_WARMUP_BYTES = 200 * 1024 * 1024 + +# Number of files to create when warming up each local directory. +# Bytes will be evenly divided across files. +DISK_WARMUP_FILES = 200 + +# Prompt for confirmation when deleting temporary files. +PROMPT_FOR_DELETES = False + +# Files to write results to +SPARK_OUTPUT_FILENAME = "results/spark_perf_output_%s_%s" % ( + SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) +PYSPARK_OUTPUT_FILENAME = "results/python_perf_output_%s_%s" % ( + SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) +STREAMING_OUTPUT_FILENAME = "results/streaming_perf_output_%s_%s" % ( + SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) +MLLIB_OUTPUT_FILENAME = "results/mllib_perf_output_%s_%s" % ( + SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) +PYTHON_MLLIB_OUTPUT_FILENAME = "results/python_mllib_perf_output_%s_%s" % ( + SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) + + +# ============================ # +# Test Configuration Options # +# ============================ # + +# The default values configured below are appropriate for approximately 20 m1.xlarge nodes, +# in which each node has 15 GB of memory. Use this variable to scale the values (e.g. +# number of records in a generated dataset) if you are running the tests with more +# or fewer nodes. 
When developing new test suites, you might want to set this to a small +# value suitable for a single machine, such as 0.001. +SCALE_FACTOR = 0.01 + +assert SCALE_FACTOR > 0, "SCALE_FACTOR must be > 0." + +# If set, removes the first N trials for each test from all reported statistics. Useful for +# tests which have outlier behavior due to JIT and other system cache warm-ups. If any test +# returns fewer N + 1 results, an exception is thrown. +IGNORED_TRIALS = 2 + +# Command used to launch Scala or Java. + +# Set up OptionSets. Note that giant cross product is done over all JavaOptionsSets + OptionSets +# passed to each test which may be combinations of those set up here. + +# Java options. +COMMON_JAVA_OPTS = [ + # Fraction of JVM memory used for caching RDDs. + JavaOptionSet("spark.storage.memoryFraction", [0.66]), + JavaOptionSet("spark.serializer", ["org.apache.spark.serializer.JavaSerializer"]), + JavaOptionSet("spark.executor.memory", ["2g"]), + # Turn event logging on in order better diagnose failed tests. Off by default as it crashes + # releases prior to 1.0.2 + # JavaOptionSet("spark.eventLog.enabled", [True]), + # To ensure consistency across runs, we disable delay scheduling + JavaOptionSet("spark.locality.wait", [str(60 * 1000 * 1000)]) +] +# Set driver memory here +SPARK_DRIVER_MEMORY = "2g" +# The following options value sets are shared among all tests. +COMMON_OPTS = [ + # How many times to run each experiment - used to warm up system caches. + # This OptionSet should probably only have a single value (i.e., length 1) + # since it doesn't make sense to have multiple values here. + OptionSet("num-trials", [10]), + # Extra pause added between trials, in seconds. For runs with large amounts + # of shuffle data, this gives time for buffer cache write-back. + OptionSet("inter-trial-wait", [3]) +] + +# The following options value sets are shared among all tests of +# operations on key-value data. +SPARK_KEY_VAL_TEST_OPTS = [ + # The number of input partitions. + OptionSet("num-partitions", [400], can_scale=True), + # The number of reduce tasks. + OptionSet("reduce-tasks", [400], can_scale=True), + # A random seed to make tests reproducable. + OptionSet("random-seed", [5]), + # Input persistence strategy (can be "memory", "disk", or "hdfs"). + # NOTE: If "hdfs" is selected, datasets will be re-used across runs of + # this script. This means parameters here are effectively ignored if + # an existing input dataset is present. + OptionSet("persistent-type", ["memory"]), + # Whether to wait for input in order to exit the JVM. + FlagSet("wait-for-exit", [False]), + # Total number of records to create. + OptionSet("num-records", [200 * 1000 * 1000], True), + # Number of unique keys to sample from. + OptionSet("unique-keys",[20 * 1000], True), + # Length in characters of each key. + OptionSet("key-length", [10]), + # Number of unique values to sample from. + OptionSet("unique-values", [1000 * 1000], True), + # Length in characters of each value. + OptionSet("value-length", [10]), + # Use hashes instead of padded numbers for keys and values + FlagSet("hash-records", [False]), + # Storage location if HDFS persistence is used + OptionSet("storage-location", [ + HDFS_URL + "/spark-perf-kv-data"]) +] + + +# ======================= # +# Spark Core Test Setup # +# ======================= # + +# Set up the actual tests. 
Each test is represented by a tuple:
+# (short_name, test_cmd, scale_factor, list of JavaOptionSets, list of OptionSets)
+
+SPARK_KV_OPTS = COMMON_OPTS + SPARK_KEY_VAL_TEST_OPTS
+SPARK_TESTS = []
+
+SCHEDULING_THROUGHPUT_OPTS = [
+    # The number of tasks that should be launched in each job:
+    OptionSet("num-tasks", [10 * 1000]),
+    # The number of jobs that should be run:
+    OptionSet("num-jobs", [1]),
+    # The size of the task closure (in bytes):
+    OptionSet("closure-size", [0]),
+    # A random seed to make tests reproducible:
+    OptionSet("random-seed", [5]),
+]
+
+SPARK_TESTS += [("scheduling-throughput", "spark.perf.TestRunner",
+    SCALE_FACTOR, COMMON_JAVA_OPTS,
+    [ConstantOption("scheduling-throughput")] + COMMON_OPTS + SCHEDULING_THROUGHPUT_OPTS)]
+
+SPARK_TESTS += [("scala-agg-by-key", "spark.perf.TestRunner", SCALE_FACTOR,
+    COMMON_JAVA_OPTS, [ConstantOption("aggregate-by-key")] + SPARK_KV_OPTS)]
+
+# Scale the input for this test by 2x since ints are smaller.
+SPARK_TESTS += [("scala-agg-by-key-int", "spark.perf.TestRunner", SCALE_FACTOR * 2,
+    COMMON_JAVA_OPTS, [ConstantOption("aggregate-by-key-int")] + SPARK_KV_OPTS)]
+
+SPARK_TESTS += [("scala-agg-by-key-naive", "spark.perf.TestRunner", SCALE_FACTOR,
+    COMMON_JAVA_OPTS, [ConstantOption("aggregate-by-key-naive")] + SPARK_KV_OPTS)]
+
+# Scale the input for this test by 0.10.
+SPARK_TESTS += [("scala-sort-by-key", "spark.perf.TestRunner", SCALE_FACTOR * 0.1,
+    COMMON_JAVA_OPTS, [ConstantOption("sort-by-key")] + SPARK_KV_OPTS)]
+
+SPARK_TESTS += [("scala-sort-by-key-int", "spark.perf.TestRunner", SCALE_FACTOR * 0.2,
+    COMMON_JAVA_OPTS, [ConstantOption("sort-by-key-int")] + SPARK_KV_OPTS)]
+
+SPARK_TESTS += [("scala-count", "spark.perf.TestRunner", SCALE_FACTOR,
+    COMMON_JAVA_OPTS, [ConstantOption("count")] + SPARK_KV_OPTS)]
+
+SPARK_TESTS += [("scala-count-w-fltr", "spark.perf.TestRunner", SCALE_FACTOR,
+    COMMON_JAVA_OPTS, [ConstantOption("count-with-filter")] + SPARK_KV_OPTS)]
+
+
+# ==================== #
+#  Pyspark Test Setup  #
+# ==================== #
+
+PYSPARK_TESTS = []
+
+BROADCAST_TEST_OPTS = [
+    # The size of broadcast
+    OptionSet("broadcast-size", [200 << 20], can_scale=True),
+]
+
+PYSPARK_TESTS += [("python-scheduling-throughput", "core_tests.py",
+    SCALE_FACTOR, COMMON_JAVA_OPTS,
+    [ConstantOption("SchedulerThroughputTest"), OptionSet("num-tasks", [5000])] + COMMON_OPTS)]
+
+PYSPARK_TESTS += [("python-agg-by-key", "core_tests.py", SCALE_FACTOR,
+    COMMON_JAVA_OPTS, [ConstantOption("AggregateByKey")] + SPARK_KV_OPTS)]
+
+# Scale the input for this test by 2x since ints are smaller.
+PYSPARK_TESTS += [("python-agg-by-key-int", "core_tests.py", SCALE_FACTOR * 2,
+    COMMON_JAVA_OPTS, [ConstantOption("AggregateByKeyInt")] + SPARK_KV_OPTS)]
+
+PYSPARK_TESTS += [("python-agg-by-key-naive", "core_tests.py", SCALE_FACTOR,
+    COMMON_JAVA_OPTS, [ConstantOption("AggregateByKeyNaive")] + SPARK_KV_OPTS)]
+
+# Scale the input for this test by 0.10.
+PYSPARK_TESTS += [("python-sort-by-key", "core_tests.py", SCALE_FACTOR * 0.1, + COMMON_JAVA_OPTS, [ConstantOption("SortByKey")] + SPARK_KV_OPTS)] + +PYSPARK_TESTS += [("python-sort-by-key-int", "core_tests.py", SCALE_FACTOR * 0.2, + COMMON_JAVA_OPTS, [ConstantOption("SortByKeyInt")] + SPARK_KV_OPTS)] + +PYSPARK_TESTS += [("python-count", "core_tests.py", SCALE_FACTOR, + COMMON_JAVA_OPTS, [ConstantOption("Count")] + SPARK_KV_OPTS)] + +PYSPARK_TESTS += [("python-count-w-fltr", "core_tests.py", SCALE_FACTOR, + COMMON_JAVA_OPTS, [ConstantOption("CountWithFilter")] + SPARK_KV_OPTS)] + +PYSPARK_TESTS += [("python-broadcast-w-bytes", "core_tests.py", SCALE_FACTOR, + COMMON_JAVA_OPTS, [ConstantOption("BroadcastWithBytes")] + SPARK_KV_OPTS + BROADCAST_TEST_OPTS)] + +PYSPARK_TESTS += [("python-broadcast-w-set", "core_tests.py", SCALE_FACTOR, + COMMON_JAVA_OPTS, [ConstantOption("BroadcastWithSet")] + SPARK_KV_OPTS + BROADCAST_TEST_OPTS)] + + +# ============================ # +# Spark Streaming Test Setup # +# ============================ # + +STREAMING_TESTS = [] + +# The following function generates options for setting batch duration in streaming tests +def streaming_batch_duration_opts(duration): + return [OptionSet("batch-duration", [duration])] + +# The following function generates options for setting window duration in streaming tests +def streaming_window_duration_opts(duration): + return [OptionSet("window-duration", [duration])] + +STREAMING_COMMON_OPTS = [ + OptionSet("total-duration", [60]), + OptionSet("hdfs-url", [HDFS_URL]), +] + +STREAMING_COMMON_JAVA_OPTS = [ + # Fraction of JVM memory used for caching RDDs. + JavaOptionSet("spark.storage.memoryFraction", [0.66]), + JavaOptionSet("spark.serializer", ["org.apache.spark.serializer.JavaSerializer"]), + # JavaOptionSet("spark.executor.memory", ["2g"]) +] + +STREAMING_KEY_VAL_TEST_OPTS = STREAMING_COMMON_OPTS + streaming_batch_duration_opts(2000) + [ + # Number of input streams. + OptionSet("num-streams", [1], can_scale=True), + # Number of records per second per input stream + OptionSet("records-per-sec", [10 * 1000]), + # Number of reduce tasks. + OptionSet("reduce-tasks", [10], can_scale=True), + # memory serialization ("true" or "false"). + OptionSet("memory-serialization", ["true"]), + # Number of unique keys to sample from. + OptionSet("unique-keys",[100 * 1000], can_scale=True), + # Length in characters of each key. 
+ OptionSet("unique-values", [1000 * 1000], can_scale=True), + # Send data through receiver + OptionSet("use-receiver", ["true"]), +] + +STREAMING_HDFS_RECOVERY_TEST_OPTS = STREAMING_COMMON_OPTS + streaming_batch_duration_opts(5000) + [ + OptionSet("records-per-file", [10000]), + OptionSet("file-cleaner-delay", [300]) +] + +# This test is just to see if everything is setup properly +STREAMING_TESTS += [("basic", "streaming.perf.TestRunner", SCALE_FACTOR, + STREAMING_COMMON_JAVA_OPTS, [ConstantOption("basic")] + STREAMING_COMMON_OPTS + streaming_batch_duration_opts(1000))] + +STREAMING_TESTS += [("state-by-key", "streaming.perf.TestRunner", SCALE_FACTOR, + STREAMING_COMMON_JAVA_OPTS, [ConstantOption("state-by-key")] + STREAMING_KEY_VAL_TEST_OPTS)] + +STREAMING_TESTS += [("group-by-key-and-window", "streaming.perf.TestRunner", SCALE_FACTOR, + STREAMING_COMMON_JAVA_OPTS, [ConstantOption("group-by-key-and-window")] + STREAMING_KEY_VAL_TEST_OPTS + streaming_window_duration_opts(10000) )] + +STREAMING_TESTS += [("reduce-by-key-and-window", "streaming.perf.TestRunner", SCALE_FACTOR, + STREAMING_COMMON_JAVA_OPTS, [ConstantOption("reduce-by-key-and-window")] + STREAMING_KEY_VAL_TEST_OPTS + streaming_window_duration_opts(10000) )] + +STREAMING_TESTS += [("hdfs-recovery", "streaming.perf.TestRunner", SCALE_FACTOR, + STREAMING_COMMON_JAVA_OPTS, [ConstantOption("hdfs-recovery")] + STREAMING_HDFS_RECOVERY_TEST_OPTS)] + + +# ================== # +# MLlib Test Setup # +# ================== # + +MLLIB_TESTS = [] +MLLIB_PERF_TEST_RUNNER = "mllib.perf.TestRunner" + +# Set this to 1.0, 1.1, 1.2, ... (the major version) to test MLlib with a particular Spark version. +# Note: You should also build mllib-perf using -Dspark.version to specify the same version. +# Note: To run perf tests against a snapshot version of Spark which has not yet been packaged into a release: +# * Build Spark locally by running `build/sbt assembly; build/sbt publishLocal` in the Spark root directory +# * Set `USE_CLUSTER_SPARK = True` and `MLLIB_SPARK_VERSION = {desired Spark version, e.g. 1.5}` +# * Don't use PREP_MLLIB_TESTS = True; instead manually run `cd mllib-tests; sbt/sbt -Dspark.version=1.5.0-SNAPSHOT clean assembly` to build perf tests + +MLLIB_SPARK_VERSION = "2.0" + +MLLIB_JAVA_OPTS = COMMON_JAVA_OPTS +if MLLIB_SPARK_VERSION >= 1.1: + MLLIB_JAVA_OPTS = MLLIB_JAVA_OPTS + [ + # Shuffle manager: SORT, HASH + JavaOptionSet("spark.shuffle.manager", ["SORT"]) + ] + +# The following options value sets are shared among all tests of +# operations on MLlib algorithms. +MLLIB_COMMON_OPTS = COMMON_OPTS + [ + # The number of input partitions. + # The default setting is suitable for a 16-node m3.2xlarge EC2 cluster. + OptionSet("num-partitions", [128], can_scale=True), + # A random seed to make tests reproducable. 
+ OptionSet("random-seed", [5]) +] + +# Algorithms available in Spark-1.0 # + +# Regression and Classification Tests # +MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS = MLLIB_COMMON_OPTS + [ + # The number of rows or examples + OptionSet("num-examples", [1000000], can_scale=True) +] + +# Generalized Linear Model (GLM) Tests # +MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ + # The number of features per example + OptionSet("num-features", [10000], can_scale=False), + # The number of iterations for SGD + OptionSet("num-iterations", [20]), + # The step size for SGD + OptionSet("step-size", [0.001]), + # Regularization type: none, l1, l2 + OptionSet("reg-type", ["l2"]), + # Regularization parameter + OptionSet("reg-param", [0.1]) +] +if MLLIB_SPARK_VERSION >= 1.1: + MLLIB_GLM_TEST_OPTS += [ + # Optimization algorithm: sgd, l-bfgs + OptionSet("optimizer", ["sgd", "l-bfgs"]) + ] +if MLLIB_SPARK_VERSION >= 1.5: + MLLIB_GLM_TEST_OPTS += [ + # Ignored, but required for config + OptionSet("elastic-net-param", [0.0]) + ] + +# GLM Regression Tests # +MLLIB_GLM_REGRESSION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [ + # The intercept for the data + OptionSet("intercept", [0.0]), + # The scale factor for label noise + OptionSet("label-noise", [0.1]), + # Loss to minimize: l2 (squared error) + OptionSet("loss", ["l2"]) +] + +MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] + MLLIB_GLM_REGRESSION_TEST_OPTS)] + +# Classification Tests # +MLLIB_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [ + # Expected fraction of examples which are negative + OptionSet("per-negative", [0.3]), +] + +# GLM Classification Tests # +MLLIB_GLM_CLASSIFICATION_TEST_OPTS = MLLIB_CLASSIFICATION_TEST_OPTS + [ + # Loss to minimize: logistic, hinge (SVM) + OptionSet("loss", ["logistic"]) +] + +MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] + + MLLIB_GLM_CLASSIFICATION_TEST_OPTS)] + +if MLLIB_SPARK_VERSION >= 1.5: + MLLIB_GLM_ELASTIC_NET_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ + # The max number of iterations for LBFGS/OWLQN + OptionSet("num-iterations", [20]), + # LBFGS/OWLQN is used with elastic-net regularization. + OptionSet("optimizer", ["auto"]), + # Using elastic-net regularization. + OptionSet("reg-type", ["elastic-net"]), + # Runs with L2 (param = 0.0), L1 (param = 1.0). + OptionSet("elastic-net-param", [0.0, 1.0]), + # Regularization param (lambda) + OptionSet("reg-param", [0.01]), + # The scale factor for the noise in feature values + OptionSet("feature-noise", [1.0]), + # The scale factor for the noise in label values + OptionSet("label-noise", [0.1]), + # The intercept for the data + OptionSet("intercept", [0.2]), + # The step size is not used in LBFGS, but this is required in parameter checking. 
+ OptionSet("step-size", [0.0]) + ] + + MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [ + # Loss to minimize: l2 (squared error) + OptionSet("loss", ["l2"]) + ] + + # Test L-BFGS + MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] + + MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS + + [OptionSet("num-features", [10000], can_scale=False)])] + # Test normal equation solver + MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] + + MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS + + [OptionSet("num-features", [100], can_scale=False)])] + + MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [ + # In GLM classification with elastic-net regularization, only logistic loss is supported. + OptionSet("loss", ["logistic"]) + ] + + # Test L-BFGS + MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] + + MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS + + [OptionSet("num-features", [10000], can_scale=False)])] + # Test normal equation solver + MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] + + MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS + + [OptionSet("num-features", [100], can_scale=False)])] + +NAIVE_BAYES_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ + # The number of features per example + OptionSet("num-features", [10000], can_scale=False), + # Expected fraction of examples which are negative + OptionSet("per-negative", [0.3]), + # The scale factor for the noise in feature values + OptionSet("feature-noise", [1.0]), + # Naive Bayes smoothing lambda. + OptionSet("nb-lambda", [1.0]), + # Model type: either multinomial or bernoulli (bernoulli only available in Spark 1.4+) + OptionSet("model-type", ["multinomial"]), +] + +MLLIB_TESTS += [("naive-bayes", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("naive-bayes")] + + NAIVE_BAYES_TEST_OPTS)] + +# Decision Trees # +MLLIB_DECISION_TREE_TEST_OPTS = MLLIB_COMMON_OPTS + [ + # The number of rows or examples + OptionSet("num-examples", [1000000], can_scale=True), + # The number of features per example + OptionSet("num-features", [500], can_scale=False), + # Type of label: 0 indicates regression, 2+ indicates classification with this many classes + # Note: multi-class (>2) is not supported in Spark 1.0. + OptionSet("label-type", [0, 2], can_scale=False), + # Fraction of features which are categorical + OptionSet("frac-categorical-features", [0.5], can_scale=False), + # Fraction of categorical features which are binary. Others have 20 categories. + OptionSet("frac-binary-features", [0.5], can_scale=False), + # Depth of true decision tree model used to label examples. + # WARNING: The meaning of depth changed from Spark 1.0 to Spark 1.1: + # depth=N for Spark 1.0 should be depth=N-1 for Spark 1.1 + OptionSet("tree-depth", [5, 10], can_scale=False), + # Maximum number of bins for the decision tree learning algorithm. 
+ OptionSet("max-bins", [32], can_scale=False), +] + +if MLLIB_SPARK_VERSION >= 1.2: + ensembleTypes = ["RandomForest"] + if MLLIB_SPARK_VERSION >= 1.3: + ensembleTypes.append("GradientBoostedTrees") + if MLLIB_SPARK_VERSION >= 1.4: + ensembleTypes.extend(["ml.RandomForest", "ml.GradientBoostedTrees"]) + MLLIB_DECISION_TREE_TEST_OPTS += [ + # Ensemble type: mllib.RandomForest, mllib.GradientBoostedTrees, + # ml.RandomForest, ml.GradientBoostedTrees + OptionSet("ensemble-type", ensembleTypes), + # Path to training dataset (if not given, use random data). + OptionSet("training-data", [""]), + # Path to test dataset (only used if training dataset given). + # If not given, hold out part of training data for validation. + OptionSet("test-data", [""]), + # Fraction of data to hold out for testing + # (Ignored if given training and test dataset, or if using synthetic data.) + OptionSet("test-data-fraction", [0.2], can_scale=False), + # Number of trees. If 1, then run DecisionTree. If >1, then run RandomForest. + OptionSet("num-trees", [1, 10], can_scale=False), + # Feature subset sampling strategy: auto, all, sqrt, log2, onethird + # (only used for RandomForest) + OptionSet("feature-subset-strategy", ["auto"]) + ] + +MLLIB_TESTS += [("decision-tree", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("decision-tree")] + + MLLIB_DECISION_TREE_TEST_OPTS)] + +# Recommendation Tests # +MLLIB_RECOMMENDATION_TEST_OPTS = MLLIB_COMMON_OPTS + [ + # The number of users + OptionSet("num-users", [6000000], can_scale=True), + # The number of products + OptionSet("num-products", [5000000], can_scale=False), + # The number of ratings + OptionSet("num-ratings", [50000000], can_scale=True), + # The number of iterations for ALS + OptionSet("num-iterations", [10]), + # The rank of the factorized matrix model + OptionSet("rank", [10]), + # The regularization parameter + OptionSet("reg-param", [0.1]), + # Whether to use implicit preferences or not + FlagSet("implicit-prefs", [False]) +] + +MLLIB_TESTS += [("als", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("als")] + + MLLIB_RECOMMENDATION_TEST_OPTS)] + +# Clustering Tests # +MLLIB_CLUSTERING_TEST_OPTS = MLLIB_COMMON_OPTS + [ + # The number of points + OptionSet("num-points", [1000000], can_scale=True), + # The number of features per point + OptionSet("num-columns", [10000], can_scale=False), + # The number of centers + OptionSet("num-centers", [20]), + # The number of iterations for KMeans + OptionSet("num-iterations", [20]) +] + +MLLIB_TESTS += [("kmeans", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("kmeans")] + MLLIB_CLUSTERING_TEST_OPTS)] + +MLLIB_GMM_TEST_OPTS = MLLIB_COMMON_OPTS + [ + OptionSet("num-points", [1000000], can_scale=True), + OptionSet("num-columns", [100], can_scale=False), + OptionSet("num-centers", [20], can_scale=False), + OptionSet("num-iterations", [20])] + +if MLLIB_SPARK_VERSION >= 1.3: + MLLIB_TESTS += [("gmm", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("gmm")] + MLLIB_GMM_TEST_OPTS)] + +MLLIB_LDA_TEST_OPTS = MLLIB_COMMON_OPTS + [ + OptionSet("num-documents", [50000], can_scale=True), + OptionSet("num-vocab", [10000], can_scale=False), + OptionSet("num-topics", [20], can_scale=False), + OptionSet("num-iterations", [20]), + OptionSet("document-length", [100]), + OptionSet("optimizer", ["em", "online"])] + +if MLLIB_SPARK_VERSION >= 1.4: + MLLIB_TESTS += [("lda", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, 
[ConstantOption("lda")] + MLLIB_LDA_TEST_OPTS)] + +# TODO: tune PIC test size to run in 20-30 seconds +MLLIB_PIC_TEST_OPTS = MLLIB_COMMON_OPTS + [ + OptionSet("num-points", [10000], can_scale=True), + OptionSet("node-degree", [10], can_scale=False), + OptionSet("num-centers", [20], can_scale=False), + OptionSet("num-iterations", [20])] + +if MLLIB_SPARK_VERSION >= 1.3: + MLLIB_TESTS += [("pic", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("pic")] + MLLIB_CLUSTERING_TEST_OPTS)] + +# Linear Algebra Tests # +MLLIB_LINALG_TEST_OPTS = MLLIB_COMMON_OPTS + [ + # The number of rows for the matrix + OptionSet("num-rows", [1000000], can_scale=True), + # The number of columns for the matrix + OptionSet("num-cols", [1000], can_scale=False), + # The number of top singular values wanted for SVD and PCA + OptionSet("rank", [50], can_scale=False) +] +# Linear Algebra Tests which take more time (slightly smaller settings) # +MLLIB_BIG_LINALG_TEST_OPTS = MLLIB_COMMON_OPTS + [ + # The number of rows for the matrix + OptionSet("num-rows", [1000000], can_scale=True), + # The number of columns for the matrix + OptionSet("num-cols", [500], can_scale=False), + # The number of top singular values wanted for SVD and PCA + OptionSet("rank", [10], can_scale=False) +] + +MLLIB_TESTS += [("svd", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("svd")] + MLLIB_BIG_LINALG_TEST_OPTS)] + +MLLIB_TESTS += [("pca", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("pca")] + MLLIB_LINALG_TEST_OPTS)] + +MLLIB_TESTS += [("summary-statistics", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("summary-statistics")] + + MLLIB_LINALG_TEST_OPTS)] + +MLLIB_BLOCK_MATRIX_MULT_TEST_OPTS = MLLIB_COMMON_OPTS + [ + OptionSet("m", [20000], can_scale=True), + OptionSet("k", [10000], can_scale=False), + OptionSet("n", [10000], can_scale=False), + OptionSet("block-size", [1024], can_scale=False)] + +if MLLIB_SPARK_VERSION >= 1.3: + MLLIB_TESTS += [("block-matrix-mult", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("block-matrix-mult")] + MLLIB_BLOCK_MATRIX_MULT_TEST_OPTS)] + +# Statistic Toolkit Tests # +MLLIB_STATS_TEST_OPTS = MLLIB_COMMON_OPTS + +MLLIB_PEARSON_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ + [OptionSet("num-rows", [1000000], can_scale=True), + OptionSet("num-cols", [1000], can_scale=False)] + +MLLIB_SPEARMAN_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ + [OptionSet("num-rows", [1000000], can_scale=True), + OptionSet("num-cols", [100], can_scale=False)] + +MLLIB_CHI_SQ_FEATURE_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ + [OptionSet("num-rows", [2000000], can_scale=True), + OptionSet("num-cols", [500], can_scale=False)] + +MLLIB_CHI_SQ_GOF_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ + [OptionSet("num-rows", [50000000], can_scale=True), + OptionSet("num-cols", [0], can_scale=False)] + +MLLIB_CHI_SQ_MAT_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ + [OptionSet("num-rows", [20000], can_scale=True), + OptionSet("num-cols", [0], can_scale=False)] + +if MLLIB_SPARK_VERSION >= 1.1: + MLLIB_TESTS += [("pearson", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("pearson")] + MLLIB_PEARSON_TEST_OPTS)] + + MLLIB_TESTS += [("spearman", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("spearman")] + MLLIB_SPEARMAN_TEST_OPTS)] + + MLLIB_TESTS += [("chi-sq-feature", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("chi-sq-feature")] + MLLIB_CHI_SQ_FEATURE_TEST_OPTS)] + + 
MLLIB_TESTS += [("chi-sq-gof", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("chi-sq-gof")] + MLLIB_CHI_SQ_GOF_TEST_OPTS)] + + MLLIB_TESTS += [("chi-sq-mat", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("chi-sq-mat")] + MLLIB_CHI_SQ_MAT_TEST_OPTS)] + +# Feature Transformation Tests # + +MLLIB_FEATURE_TEST_OPTS = MLLIB_COMMON_OPTS + +MLLIB_WORD2VEC_TEST_OPTS = MLLIB_FEATURE_TEST_OPTS + \ + [OptionSet("num-sentences", [1000000], can_scale=True), + OptionSet("num-words", [10000], can_scale=False), + OptionSet("vector-size", [100], can_scale=False), + OptionSet("num-iterations", [3], can_scale=False), + OptionSet("min-count", [5], can_scale=False)] + +if MLLIB_SPARK_VERSION >= 1.3: # TODO: make it work in 1.2 + MLLIB_TESTS += [("word2vec", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("word2vec")] + MLLIB_WORD2VEC_TEST_OPTS)] + +# Frequent Pattern Matching Tests # + +MLLIB_FPM_TEST_OPTS = MLLIB_COMMON_OPTS + +MLLIB_FP_GROWTH_TEST_OPTS = MLLIB_FPM_TEST_OPTS + \ + [OptionSet("num-baskets", [5000000], can_scale=True), + OptionSet("avg-basket-size", [10], can_scale=False), + OptionSet("num-items", [1000], can_scale=False), + OptionSet("min-support", [0.01], can_scale=False)] + +if MLLIB_SPARK_VERSION >= 1.3: + MLLIB_TESTS += [("fp-growth", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("fp-growth")] + MLLIB_FP_GROWTH_TEST_OPTS)] + +# TODO: tune test size to have runtime within 30-60 seconds +MLLIB_PREFIX_SPAN_TEST_OPTS = MLLIB_FPM_TEST_OPTS + \ + [OptionSet("num-sequences", [5000000], can_scale=True), + OptionSet("avg-sequence-size", [5], can_scale=False), + OptionSet("avg-itemset-size", [1], can_scale=False), + OptionSet("num-items", [100], can_scale=False), + OptionSet("min-support", [0.5], can_scale=False), + OptionSet("max-pattern-len", [10], can_scale=False), + OptionSet("max-local-proj-db-size", [32000000], can_scale=False)] + +if MLLIB_SPARK_VERSION >= 1.5: + MLLIB_TESTS += [("prefix-span", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("prefix-span")] + MLLIB_PREFIX_SPAN_TEST_OPTS)] + +# Python MLlib tests +PYTHON_MLLIB_TESTS = [] + +PYTHON_MLLIB_TESTS += [("python-glm-classification", "mllib_tests.py", SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("GLMClassificationTest")] + + MLLIB_GLM_CLASSIFICATION_TEST_OPTS)] + +PYTHON_MLLIB_TESTS += [("python-glm-regression", "mllib_tests.py", SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("GLMRegressionTest")] + + MLLIB_GLM_REGRESSION_TEST_OPTS)] + +PYTHON_MLLIB_TESTS += [("python-naive-bayes", "mllib_tests.py", SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("NaiveBayesTest")] + + NAIVE_BAYES_TEST_OPTS)] + +PYTHON_MLLIB_TESTS += [("python-als", "mllib_tests.py", SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("ALSTest")] + + MLLIB_RECOMMENDATION_TEST_OPTS)] + +PYTHON_MLLIB_TESTS += [("python-kmeans", "mllib_tests.py", SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("KMeansTest")] + MLLIB_CLUSTERING_TEST_OPTS)] + +if MLLIB_SPARK_VERSION >= 1.1: + PYTHON_MLLIB_TESTS += [("python-pearson", "mllib_tests.py", SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("PearsonCorrelationTest")] + + MLLIB_PEARSON_TEST_OPTS)] + + PYTHON_MLLIB_TESTS += [("python-spearman", "mllib_tests.py", SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("SpearmanCorrelationTest")] + + MLLIB_SPEARMAN_TEST_OPTS)] + From cd7eb1244525dcb983bc7cb268318eeb4721fca0 Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Thu, 30 Jun 2016 17:01:22 +0100 
Subject: [PATCH 06/22] Add ability to override scala version and cleanup in mllib project file --- mllib-tests/project/MLlibTestsBuild.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala index ebf12ab..6a51e93 100644 --- a/mllib-tests/project/MLlibTestsBuild.scala +++ b/mllib-tests/project/MLlibTestsBuild.scala @@ -16,8 +16,8 @@ object MLlibTestsBuild extends Build { lazy val commonSettings = Seq( organization := "org.spark-project", version := "0.1", - scalaVersion := "2.11.8", - sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0-SNAPSHOT"), + scalaVersion := sys.props.getOrElse("scala.version", default="2.11.8"), + sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0"), libraryDependencies ++= Seq( "net.sf.jopt-simple" % "jopt-simple" % "4.6", "org.scalatest" %% "scalatest" % "2.2.1" % "test", @@ -42,9 +42,9 @@ object MLlibTestsBuild extends Build { println("sparkVersion.value is: " + sparkVersion.value) val targetFolder = sparkVersion.value match { case v if v.startsWith("1.4.") => "v1p4" - case v if v.startsWith("1.5.") => "v1p5" + case v if v.startsWith("1.5.") => "v1p5" // acceptable for now, but change later when new algs are added case v if v.startsWith("1.6.") => "v1p5" - case v if v.startsWith("2.0") => "v2p0" + case v if v.startsWith("2.0") => "v2p0" case _ => throw new IllegalArgumentException(s"This Spark version isn't suppored: ${sparkVersion.value}.") } baseDirectory.value / targetFolder / "src" / "main" / "scala" From 95d31d300bb8c3f52ca74a8210ce185245c7a799 Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Thu, 30 Jun 2016 17:15:38 +0100 Subject: [PATCH 07/22] Add QA 1.6 fixes code back in --- .../scala/mllib/perf/MLAlgorithmTests.scala | 22 +++---- .../mllib/perf/MLAlgorithmTests.scala.rej | 58 +++++++++++++++++++ .../perf/clustering/GaussianMixtureTest.scala | 14 ++--- .../scala/mllib/perf/clustering/PICTest.scala | 13 +++-- .../scala/mllib/perf/util/DataGenerator.scala | 8 +-- .../scala/mllib/perf/MLAlgorithmTests.scala | 37 +++++++----- .../perf/clustering/GaussianMixtureTest.scala | 14 ++--- .../scala/mllib/perf/clustering/PICTest.scala | 13 +++-- .../scala/mllib/perf/util/DataGenerator.scala | 8 +-- pyspark-tests/mllib_tests.py | 8 +-- 10 files changed, 131 insertions(+), 64 deletions(-) create mode 100644 mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala.rej diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala index 6f89aac..0d438db 100644 --- a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala +++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala @@ -315,13 +315,13 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { def runTest(rdd: RDD[Vector]): KMeansModel - val NUM_POINTS = ("num-points", "number of points for clustering tests") - val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") + val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") + val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) - longOptions = longOptions ++ 
Seq(NUM_POINTS) + intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) + longOptions = longOptions ++ Seq(NUM_EXAMPLES) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() @@ -329,21 +329,21 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { var testRdd: RDD[Vector] = _ def validate(model: KMeansModel, rdd: RDD[Vector]): Double = { - val numPoints = rdd.cache().count() + val numExamples = rdd.cache().count() val error = model.computeCost(rdd) - math.sqrt(error/numPoints) + math.sqrt(error/numExamples) } override def createInputData(seed: Long) = { val numPartitions: Int = intOptionValue(NUM_PARTITIONS) - val numPoints: Long = longOptionValue(NUM_POINTS) - val numColumns: Int = intOptionValue(NUM_COLUMNS) + val numExamples: Long = longOptionValue(NUM_EXAMPLES) + val numFeatures: Int = intOptionValue(NUM_FEATURES) val numCenters: Int = intOptionValue(NUM_CENTERS) - val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numPoints*1.25).toLong, numColumns, + val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numExamples*1.25).toLong, numFeatures, numCenters, numPartitions, seed) val split = data.randomSplit(Array(0.8, 0.2), seed) @@ -441,9 +441,10 @@ class ALSTest(sc: SparkContext) extends RecommendationTests(sc) { val rank: Int = intOptionValue(RANK) val regParam = doubleOptionValue(REG_PARAM) val seed = intOptionValue(RANDOM_SEED) + 12 + val implicitRatings: Boolean = booleanOptionValue(IMPLICIT) new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam) - .setBlocks(rdd.partitions.size).run(rdd) + .setBlocks(rdd.partitions.length).setImplicitPrefs(implicitRatings).run(rdd) } } @@ -602,3 +603,4 @@ class DecisionTreeTest(sc: SparkContext) extends DecisionTreeTests(sc) { } } } + diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala.rej b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala.rej new file mode 100644 index 0000000..7c1776b --- /dev/null +++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala.rej @@ -0,0 +1,58 @@ +diff a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala (rejected hunks) +@@ -315,13 +315,13 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { + + def runTest(rdd: RDD[Vector]): KMeansModel + +- val NUM_POINTS = ("num-points", "number of points for clustering tests") +- val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") ++ val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") ++ val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") + val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") + val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") + +- intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) +- longOptions = longOptions ++ Seq(NUM_POINTS) ++ intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) ++ longOptions = longOptions ++ Seq(NUM_EXAMPLES) + val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions + addOptionsToParser() + +@@ -329,21 +329,21 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { + var testRdd: RDD[Vector] = _ + + def validate(model: KMeansModel, rdd: RDD[Vector]): Double = { +- val numPoints = 
rdd.cache().count() ++ val numExamples = rdd.cache().count() + + val error = model.computeCost(rdd) + +- math.sqrt(error/numPoints) ++ math.sqrt(error/numExamples) + } + + override def createInputData(seed: Long) = { + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + +- val numPoints: Long = longOptionValue(NUM_POINTS) +- val numColumns: Int = intOptionValue(NUM_COLUMNS) ++ val numExamples: Long = longOptionValue(NUM_EXAMPLES) ++ val numFeatures: Int = intOptionValue(NUM_FEATURES) + val numCenters: Int = intOptionValue(NUM_CENTERS) + +- val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numPoints*1.25).toLong, numColumns, ++ val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numExamples*1.25).toLong, numFeatures, + numCenters, numPartitions, seed) + + val split = data.randomSplit(Array(0.8, 0.2), seed) +@@ -441,9 +441,10 @@ class ALSTest(sc: SparkContext) extends RecommendationTests(sc) { + val rank: Int = intOptionValue(RANK) + val regParam = doubleOptionValue(REG_PARAM) + val seed = intOptionValue(RANDOM_SEED) + 12 ++ val implicitRatings: Boolean = booleanOptionValue(IMPLICIT) + + new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam) +- .setBlocks(rdd.partitions.size).run(rdd) ++ .setBlocks(rdd.partitions.length).setImplicitPrefs(implicitRatings).run(rdd) + } + } + diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala index 0004f8d..5903e2e 100644 --- a/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala +++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala @@ -16,21 +16,21 @@ import mllib.perf.PerfTest class GaussianMixtureTest(sc: SparkContext) extends PerfTest { // TODO: refactor k-means and GMM code - val NUM_POINTS = ("num-points", "number of points for clustering tests") - val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") + val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") + val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - intOptions ++= Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) - longOptions ++= Seq(NUM_POINTS) + intOptions ++= Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) + longOptions ++= Seq(NUM_EXAMPLES) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() var data: RDD[Vector] = _ override def createInputData(seed: Long): Unit = { - val m = longOptionValue(NUM_POINTS) - val n = intOptionValue(NUM_COLUMNS) + val m = longOptionValue(NUM_EXAMPLES) + val n = intOptionValue(NUM_FEATURES) val k = intOptionValue(NUM_CENTERS) val p = intOptionValue(NUM_PARTITIONS) @@ -47,7 +47,7 @@ class GaussianMixtureTest(sc: SparkContext) extends PerfTest { Vectors.dense(y.data) } }.cache() - logInfo(s"Generated ${data.count()} points.") + logInfo(s"Generated ${data.count()} examples.") } override def run(): JValue = { diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala index 6832ffa..2018c61 100644 --- a/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala +++ 
b/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala @@ -11,28 +11,28 @@ import mllib.perf.PerfTest class PICTest(sc: SparkContext) extends PerfTest { - val NUM_POINTS = ("num-points", "number of points") + val NUM_EXAMPLES = ("num-examples", "number of examples") val NODE_DEGREE = ("node-degree", "number of neighbors each node is connected to") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") intOptions ++= Seq(NODE_DEGREE, NUM_CENTERS, NUM_ITERATIONS) - longOptions ++= Seq(NUM_POINTS) + longOptions ++= Seq(NUM_EXAMPLES) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() var data: RDD[(Long, Long, Double)] = _ override def createInputData(seed: Long): Unit = { - val numPoints = longOptionValue(NUM_POINTS) + val numExamples = longOptionValue(NUM_EXAMPLES) val nodeDegree = intOptionValue(NODE_DEGREE) val numPartitions = intOptionValue(NUM_PARTITIONS) // Generates a periodic banded matrix with bandwidth = nodeDegree - val data = sc.parallelize(0L to numPoints, numPartitions) + data = sc.parallelize(0L to numExamples, numPartitions) .flatMap { id => - (((id - nodeDegree / 2) % numPoints) until id).map { nbr => - (id, (nbr + numPoints) % numPoints, 1D) + (((id - nodeDegree / 2) % numExamples) until id).map { nbr => + (id, (nbr + numExamples) % numExamples, 1D) } } logInfo(s"Generated ${data.count()} pairwise similarities.") @@ -46,6 +46,7 @@ class PICTest(sc: SparkContext) extends PerfTest { .setK(k) .setMaxIterations(numIterations) val model = pic.run(data) + model.assignments.count() val duration = (System.currentTimeMillis() - start) / 1e3 "time" -> duration } diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala index 6e354fd..f721ca7 100644 --- a/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala +++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala @@ -509,7 +509,7 @@ class FeaturesGenerator(val categoricalArities: Array[Int], val numContinuous: I class KMeansDataGenerator( val numCenters: Int, - val numColumns: Int, + val numFeatures: Int, val seed: Long) extends RandomDataGenerator[Vector] { private val rng = new java.util.Random(seed) @@ -528,7 +528,7 @@ class KMeansDataGenerator( } private val centers = (0 until numCenters).map{i => - Array.fill(numColumns)((2 * rng.nextDouble() - 1)*scale_factors(i)) + Array.fill(numFeatures)((2 * rng.nextDouble() - 1)*scale_factors(i)) } override def nextValue(): Vector = { @@ -536,12 +536,12 @@ class KMeansDataGenerator( val centerToAddTo = centers(concentrations.indexWhere(p => pick_center_rand <= p)) - Vectors.dense(Array.tabulate(numColumns)(i => centerToAddTo(i) + rng2.nextGaussian())) + Vectors.dense(Array.tabulate(numFeatures)(i => centerToAddTo(i) + rng2.nextGaussian())) } override def setSeed(seed: Long) { rng.setSeed(seed) } - override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numColumns, seed) + override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numFeatures, seed) } diff --git a/mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala index 1f1ec27..1c06465 100644 --- a/mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala +++ 
b/mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala @@ -97,10 +97,12 @@ abstract class GLMTests(sc: SparkContext) class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) { val INTERCEPT = ("intercept", "intercept for random data generation") + val FEATURE_NOISE = ("feature-noise", + "scale factor for the noise during feature generation; CURRENTLY IGNORED") val LABEL_NOISE = ("label-noise", "scale factor for the noise during label generation") val LOSS = ("loss", "loss to minimize. Supported: l2 (squared error).") - doubleOptions = doubleOptions ++ Seq(INTERCEPT, LABEL_NOISE) + doubleOptions = doubleOptions ++ Seq(INTERCEPT, FEATURE_NOISE, LABEL_NOISE) stringOptions = stringOptions ++ Seq(LOSS) val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions @@ -158,6 +160,7 @@ class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) { .setElasticNetParam(elasticNetParam) .setRegParam(regParam) .setMaxIter(numIterations) + .setTol(0.0) val sqlContext = new SQLContext(rdd.context) import sqlContext.implicits._ val mlModel = rr.fit(rdd.toDF()) @@ -265,6 +268,7 @@ class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) { .setElasticNetParam(elasticNetParam) .setRegParam(regParam) .setMaxIter(numIterations) + .setTol(0.0) val sqlContext = new SQLContext(rdd.context) import sqlContext.implicits._ val mlModel = lor.fit(rdd.toDF()) @@ -379,6 +383,8 @@ abstract class RecommendationTests(sc: SparkContext) extends PerfTest { val testMetric = validate(model, testRdd) + /* + // Removed temporarily because these methods are really slow. val numThingsToRecommend = 10 start = System.currentTimeMillis() model.recommendProductsForUsers(numThingsToRecommend).count() @@ -386,11 +392,11 @@ abstract class RecommendationTests(sc: SparkContext) extends PerfTest { start = System.currentTimeMillis() model.recommendUsersForProducts(numThingsToRecommend).count() val recommendUsersForProductsTime = (System.currentTimeMillis() - start).toDouble / 1000.0 - + */ Map("trainingTime" -> trainingTime, "testTime" -> testTime, - "trainingMetric" -> trainingMetric, "testMetric" -> testMetric, - "recommendProductsForUsersTime" -> recommendProductsForUsersTime, - "recommendUsersForProductsTime" -> recommendUsersForProductsTime) + "trainingMetric" -> trainingMetric, "testMetric" -> testMetric) + // "recommendProductsForUsersTime" -> recommendProductsForUsersTime, + // "recommendUsersForProductsTime" -> recommendUsersForProductsTime) } } @@ -398,13 +404,13 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { def runTest(rdd: RDD[Vector]): KMeansModel - val NUM_POINTS = ("num-points", "number of points for clustering tests") - val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") + val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") + val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) - longOptions = longOptions ++ Seq(NUM_POINTS) + intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) + longOptions = longOptions ++ Seq(NUM_EXAMPLES) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() @@ -412,21 +418,21 @@ abstract class 
ClusteringTests(sc: SparkContext) extends PerfTest { var testRdd: RDD[Vector] = _ def validate(model: KMeansModel, rdd: RDD[Vector]): Double = { - val numPoints = rdd.cache().count() + val numExamples = rdd.cache().count() val error = model.computeCost(rdd) - math.sqrt(error/numPoints) + math.sqrt(error/numExamples) } override def createInputData(seed: Long) = { val numPartitions: Int = intOptionValue(NUM_PARTITIONS) - val numPoints: Long = longOptionValue(NUM_POINTS) - val numColumns: Int = intOptionValue(NUM_COLUMNS) + val numExamples: Long = longOptionValue(NUM_EXAMPLES) + val numFeatures: Int = intOptionValue(NUM_FEATURES) val numCenters: Int = intOptionValue(NUM_CENTERS) - val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numPoints*1.25).toLong, numColumns, + val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numExamples*1.25).toLong, numFeatures, numCenters, numPartitions, seed) val split = data.randomSplit(Array(0.8, 0.2), seed) @@ -524,9 +530,10 @@ class ALSTest(sc: SparkContext) extends RecommendationTests(sc) { val rank: Int = intOptionValue(RANK) val regParam = doubleOptionValue(REG_PARAM) val seed = intOptionValue(RANDOM_SEED) + 12 + val implicitRatings: Boolean = booleanOptionValue(IMPLICIT) new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam) - .setBlocks(rdd.partitions.length).run(rdd) + .setBlocks(rdd.partitions.length).setImplicitPrefs(implicitRatings).run(rdd) } } diff --git a/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala b/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala index 95ce9c6..13da1ac 100644 --- a/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala +++ b/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala @@ -15,21 +15,21 @@ import mllib.perf.PerfTest class GaussianMixtureTest(sc: SparkContext) extends PerfTest { - val NUM_POINTS = ("num-points", "number of points for clustering tests") - val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") + val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") + val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - intOptions ++= Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) - longOptions ++= Seq(NUM_POINTS) + intOptions ++= Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) + longOptions ++= Seq(NUM_EXAMPLES) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() var data: RDD[Vector] = _ override def createInputData(seed: Long): Unit = { - val m = longOptionValue(NUM_POINTS) - val n = intOptionValue(NUM_COLUMNS) + val m = longOptionValue(NUM_EXAMPLES) + val n = intOptionValue(NUM_FEATURES) val k = intOptionValue(NUM_CENTERS) val p = intOptionValue(NUM_PARTITIONS) @@ -46,7 +46,7 @@ class GaussianMixtureTest(sc: SparkContext) extends PerfTest { Vectors.dense(y.data) } }.cache() - logInfo(s"Generated ${data.count()} points.") + logInfo(s"Generated ${data.count()} examples.") } override def run(): JValue = { diff --git a/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/PICTest.scala b/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/PICTest.scala index 6832ffa..2018c61 100644 --- 
a/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/PICTest.scala +++ b/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/PICTest.scala @@ -11,28 +11,28 @@ import mllib.perf.PerfTest class PICTest(sc: SparkContext) extends PerfTest { - val NUM_POINTS = ("num-points", "number of points") + val NUM_EXAMPLES = ("num-examples", "number of examples") val NODE_DEGREE = ("node-degree", "number of neighbors each node is connected to") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") intOptions ++= Seq(NODE_DEGREE, NUM_CENTERS, NUM_ITERATIONS) - longOptions ++= Seq(NUM_POINTS) + longOptions ++= Seq(NUM_EXAMPLES) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() var data: RDD[(Long, Long, Double)] = _ override def createInputData(seed: Long): Unit = { - val numPoints = longOptionValue(NUM_POINTS) + val numExamples = longOptionValue(NUM_EXAMPLES) val nodeDegree = intOptionValue(NODE_DEGREE) val numPartitions = intOptionValue(NUM_PARTITIONS) // Generates a periodic banded matrix with bandwidth = nodeDegree - val data = sc.parallelize(0L to numPoints, numPartitions) + data = sc.parallelize(0L to numExamples, numPartitions) .flatMap { id => - (((id - nodeDegree / 2) % numPoints) until id).map { nbr => - (id, (nbr + numPoints) % numPoints, 1D) + (((id - nodeDegree / 2) % numExamples) until id).map { nbr => + (id, (nbr + numExamples) % numExamples, 1D) } } logInfo(s"Generated ${data.count()} pairwise similarities.") @@ -46,6 +46,7 @@ class PICTest(sc: SparkContext) extends PerfTest { .setK(k) .setMaxIterations(numIterations) val model = pic.run(data) + model.assignments.count() val duration = (System.currentTimeMillis() - start) / 1e3 "time" -> duration } diff --git a/mllib-tests/v1p5/src/main/scala/mllib/perf/util/DataGenerator.scala b/mllib-tests/v1p5/src/main/scala/mllib/perf/util/DataGenerator.scala index e65a5a5..33f041e 100644 --- a/mllib-tests/v1p5/src/main/scala/mllib/perf/util/DataGenerator.scala +++ b/mllib-tests/v1p5/src/main/scala/mllib/perf/util/DataGenerator.scala @@ -548,7 +548,7 @@ class FeaturesGenerator(val categoricalArities: Array[Int], val numContinuous: I class KMeansDataGenerator( val numCenters: Int, - val numColumns: Int, + val numFeatures: Int, val seed: Long) extends RandomDataGenerator[Vector] { private val rng = new java.util.Random(seed) @@ -567,7 +567,7 @@ class KMeansDataGenerator( } private val centers = (0 until numCenters).map{i => - Array.fill(numColumns)((2 * rng.nextDouble() - 1)*scale_factors(i)) + Array.fill(numFeatures)((2 * rng.nextDouble() - 1)*scale_factors(i)) } override def nextValue(): Vector = { @@ -575,12 +575,12 @@ class KMeansDataGenerator( val centerToAddTo = centers(concentrations.indexWhere(p => pick_center_rand <= p)) - Vectors.dense(Array.tabulate(numColumns)(i => centerToAddTo(i) + rng2.nextGaussian())) + Vectors.dense(Array.tabulate(numFeatures)(i => centerToAddTo(i) + rng2.nextGaussian())) } override def setSeed(seed: Long) { rng.setSeed(seed) } - override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numColumns, seed) + override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numFeatures, seed) } diff --git a/pyspark-tests/mllib_tests.py b/pyspark-tests/mllib_tests.py index 133d751..1b6a306 100644 --- a/pyspark-tests/mllib_tests.py +++ b/pyspark-tests/mllib_tests.py @@ -219,8 +219,8 @@ def __init__(self, sc): def 
createInputData(self): options = self.options - numTrain = options.num_points - numTest = int(options.num_points * 0.2) + numTrain = options.num_examples + numTest = int(options.num_examples * 0.2) self.trainRDD = LabeledDataGenerator.generateGLMData( self.sc, numTrain, options.num_features, options.num_partitions, options.random_seed, labelType=2) @@ -242,7 +242,7 @@ def __init__(self, sc): def createInputData(self): options = self.options self.data = FeaturesGenerator.generateContinuousData( - self.sc, options.num_points, options.num_columns, + self.sc, options.num_examples, options.num_features, options.num_partitions, options.random_seed) def runTest(self): @@ -368,8 +368,6 @@ def runTest(self): parser.add_option("--num-ratings", type="int", default=500) parser.add_option("--implicit-prefs", type="int", default=0) # MLLIB_CLUSTERING_TEST_OPTS - parser.add_option("--num-points", type="int", default=1000) - parser.add_option("--num-columns", type="int", default=10) parser.add_option("--num-centers", type="int", default=5) # MLLIB_LINALG_TEST_OPTS + MLLIB_STATS_TEST_OPTS parser.add_option("--num-rows", type="int", default=1000) From 8cb9e6240525143334c940fec9b8e7beab4e2b31 Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Thu, 30 Jun 2016 17:17:31 +0100 Subject: [PATCH 08/22] Remove .rej file too --- .../mllib/perf/MLAlgorithmTests.scala.rej | 58 ------------------- 1 file changed, 58 deletions(-) delete mode 100644 mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala.rej diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala.rej b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala.rej deleted file mode 100644 index 7c1776b..0000000 --- a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala.rej +++ /dev/null @@ -1,58 +0,0 @@ -diff a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala (rejected hunks) -@@ -315,13 +315,13 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { - - def runTest(rdd: RDD[Vector]): KMeansModel - -- val NUM_POINTS = ("num-points", "number of points for clustering tests") -- val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") -+ val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") -+ val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") - val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") - val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - -- intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) -- longOptions = longOptions ++ Seq(NUM_POINTS) -+ intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) -+ longOptions = longOptions ++ Seq(NUM_EXAMPLES) - val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions - addOptionsToParser() - -@@ -329,21 +329,21 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { - var testRdd: RDD[Vector] = _ - - def validate(model: KMeansModel, rdd: RDD[Vector]): Double = { -- val numPoints = rdd.cache().count() -+ val numExamples = rdd.cache().count() - - val error = model.computeCost(rdd) - -- math.sqrt(error/numPoints) -+ math.sqrt(error/numExamples) - } - - override def createInputData(seed: Long) = { - val numPartitions: Int = intOptionValue(NUM_PARTITIONS) - -- val numPoints: Long = longOptionValue(NUM_POINTS) -- val 
numColumns: Int = intOptionValue(NUM_COLUMNS) -+ val numExamples: Long = longOptionValue(NUM_EXAMPLES) -+ val numFeatures: Int = intOptionValue(NUM_FEATURES) - val numCenters: Int = intOptionValue(NUM_CENTERS) - -- val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numPoints*1.25).toLong, numColumns, -+ val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numExamples*1.25).toLong, numFeatures, - numCenters, numPartitions, seed) - - val split = data.randomSplit(Array(0.8, 0.2), seed) -@@ -441,9 +441,10 @@ class ALSTest(sc: SparkContext) extends RecommendationTests(sc) { - val rank: Int = intOptionValue(RANK) - val regParam = doubleOptionValue(REG_PARAM) - val seed = intOptionValue(RANDOM_SEED) + 12 -+ val implicitRatings: Boolean = booleanOptionValue(IMPLICIT) - - new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam) -- .setBlocks(rdd.partitions.size).run(rdd) -+ .setBlocks(rdd.partitions.length).setImplicitPrefs(implicitRatings).run(rdd) - } - } - From 9e26cfccf8b1b9a20743a0f650e5a7d8964a0e13 Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Thu, 30 Jun 2016 17:21:40 +0100 Subject: [PATCH 09/22] Comment dep on 2.0.0 preview --- spark-tests/project/SparkTestsBuild.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/spark-tests/project/SparkTestsBuild.scala b/spark-tests/project/SparkTestsBuild.scala index 697b28a..707c39a 100644 --- a/spark-tests/project/SparkTestsBuild.scala +++ b/spark-tests/project/SparkTestsBuild.scala @@ -15,8 +15,14 @@ object SparkTestsBuild extends Build { "net.sf.jopt-simple" % "jopt-simple" % "4.6", "org.scalatest" %% "scalatest" % "2.2.1" % "test", "com.google.guava" % "guava" % "14.0.1", - "org.apache.spark" %% "spark-core" % "2.0.0-preview" % "provided", "org.json4s" %% "json4s-native" % "3.2.10" + + // IMPORTANT! + // We need to uncomment the below once Spark 2.0.0 becomes available + // This relies on using spark built under the lib folder + // of this project + //"org.apache.spark" %% "spark-core" % "2.0.0-SNAPSHOT" % "provided", + ), test in assembly := {}, outputPath in assembly := file("target/spark-perf-tests-assembly.jar"), From 9cc8cae334d0a0c6342b87921edf7d6cfaa41c75 Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Fri, 1 Jul 2016 11:14:17 +0100 Subject: [PATCH 10/22] Use original config.py but with SPARK_HOME recognised, default local cluster --- config/config.py.template | 85 ++++++++++++----------- mllib-tests/project/MLlibTestsBuild.scala | 2 +- 2 files changed, 46 insertions(+), 41 deletions(-) diff --git a/config/config.py.template b/config/config.py.template index 019cbac..8489d6f 100755 --- a/config/config.py.template +++ b/config/config.py.template @@ -19,17 +19,19 @@ from sparkperf.config_utils import FlagSet, JavaOptionSet, OptionSet, ConstantOp DEFAULT_HOME=os.environ['HOME'] +# Point to an installation of Spark on the cluster. SPARK_HOME_DIR = os.getenv('SPARK_HOME', DEFAULT_HOME) # Use a custom configuration directory SPARK_CONF_DIR = SPARK_HOME_DIR + "/conf" # Master used when submitting Spark jobs. 
-# For local clusters: "spark://%s:7077" % socket.gethostname() +# For EC2 clusters: open("/root/spark-ec2/cluster-url", 'r').readline().strip() +# For local clusters (default): "spark://%s:7077" % socket.gethostname() # For Yarn clusters: "yarn" -# Otherwise, the default uses the specified EC2 cluster -SPARK_CLUSTER_URL = "spark://%s:7077" % socket.gethostname() +SPARK_CLUSTER_URL="spark://%s:7077" % socket.gethostname() + IS_YARN_MODE = "yarn" in SPARK_CLUSTER_URL IS_MESOS_MODE = "mesos" in SPARK_CLUSTER_URL @@ -78,18 +80,18 @@ RESTART_SPARK_CLUSTER = RESTART_SPARK_CLUSTER and not IS_YARN_MODE RSYNC_SPARK_HOME = True # Which tests to run -RUN_SPARK_TESTS = False +RUN_SPARK_TESTS = True RUN_PYSPARK_TESTS = False RUN_STREAMING_TESTS = False -RUN_MLLIB_TESTS = True -RUN_PYTHON_MLLIB_TESTS = True +RUN_MLLIB_TESTS = False +RUN_PYTHON_MLLIB_TESTS = False # Which tests to prepare. Set this to true for the first # installation or whenever you make a change to the tests. PREP_SPARK_TESTS = True -PREP_PYSPARK_TESTS = True -PREP_STREAMING_TESTS = True -PREP_MLLIB_TESTS = True +PREP_PYSPARK_TESTS = False +PREP_STREAMING_TESTS = False +PREP_MLLIB_TESTS = False # Whether to warm up local disks (warm-up is only necessary on EC2). DISK_WARMUP = False @@ -102,7 +104,7 @@ DISK_WARMUP_BYTES = 200 * 1024 * 1024 DISK_WARMUP_FILES = 200 # Prompt for confirmation when deleting temporary files. -PROMPT_FOR_DELETES = False +PROMPT_FOR_DELETES = True # Files to write results to SPARK_OUTPUT_FILENAME = "results/spark_perf_output_%s_%s" % ( @@ -126,7 +128,7 @@ PYTHON_MLLIB_OUTPUT_FILENAME = "results/python_mllib_perf_output_%s_%s" % ( # number of records in a generated dataset) if you are running the tests with more # or fewer nodes. When developing new test suites, you might want to set this to a small # value suitable for a single machine, such as 0.001. -SCALE_FACTOR = 0.01 +SCALE_FACTOR = 1.0 assert SCALE_FACTOR > 0, "SCALE_FACTOR must be > 0." @@ -145,7 +147,7 @@ COMMON_JAVA_OPTS = [ # Fraction of JVM memory used for caching RDDs. JavaOptionSet("spark.storage.memoryFraction", [0.66]), JavaOptionSet("spark.serializer", ["org.apache.spark.serializer.JavaSerializer"]), - JavaOptionSet("spark.executor.memory", ["2g"]), + # JavaOptionSet("spark.executor.memory", ["9g"]), # Turn event logging on in order to better diagnose failed tests. Off by default as it crashes # releases prior to 1.0.2 # JavaOptionSet("spark.eventLog.enabled", [True]), @@ -153,7 +155,7 @@ COMMON_JAVA_OPTS = [ JavaOptionSet("spark.locality.wait", [str(60 * 1000 * 1000)]) ] # Set driver memory here -SPARK_DRIVER_MEMORY = "2g" +SPARK_DRIVER_MEMORY = "20g" # The following option value sets are shared among all tests. COMMON_OPTS = [ + # How many times to run each experiment - used to warm up system caches. @@ -316,7 +318,8 @@ STREAMING_COMMON_JAVA_OPTS = [ # Fraction of JVM memory used for caching RDDs.
JavaOptionSet("spark.storage.memoryFraction", [0.66]), JavaOptionSet("spark.serializer", ["org.apache.spark.serializer.JavaSerializer"]), - # JavaOptionSet("spark.executor.memory", ["2g"]) + # JavaOptionSet("spark.executor.memory", ["9g"]), + JavaOptionSet("spark.executor.extraJavaOptions", [" -XX:+UseConcMarkSweepGC "]) ] STREAMING_KEY_VAL_TEST_OPTS = STREAMING_COMMON_OPTS + streaming_batch_duration_opts(2000) + [ @@ -371,8 +374,7 @@ MLLIB_PERF_TEST_RUNNER = "mllib.perf.TestRunner" # * Build Spark locally by running `build/sbt assembly; build/sbt publishLocal` in the Spark root directory # * Set `USE_CLUSTER_SPARK = True` and `MLLIB_SPARK_VERSION = {desired Spark version, e.g. 1.5}` # * Don't use PREP_MLLIB_TESTS = True; instead manually run `cd mllib-tests; sbt/sbt -Dspark.version=1.5.0-SNAPSHOT clean assembly` to build perf tests - -MLLIB_SPARK_VERSION = "2.0" +MLLIB_SPARK_VERSION = 1.5 MLLIB_JAVA_OPTS = COMMON_JAVA_OPTS if MLLIB_SPARK_VERSION >= 1.1: @@ -401,6 +403,9 @@ MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS = MLLIB_COMMON_OPTS + [ # Generalized Linear Model (GLM) Tests # MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ + # The scale factor for the noise in feature values. + # Currently ignored for regression. + OptionSet("feature-noise", [1.0]), # The number of features per example OptionSet("num-features", [10000], can_scale=False), # The number of iterations for SGD @@ -412,11 +417,6 @@ MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ # Regularization parameter OptionSet("reg-param", [0.1]) ] -if MLLIB_SPARK_VERSION >= 1.1: - MLLIB_GLM_TEST_OPTS += [ - # Optimization algorithm: sgd, l-bfgs - OptionSet("optimizer", ["sgd", "l-bfgs"]) - ] if MLLIB_SPARK_VERSION >= 1.5: MLLIB_GLM_TEST_OPTS += [ # Ignored, but required for config @@ -425,6 +425,8 @@ if MLLIB_SPARK_VERSION >= 1.5: # GLM Regression Tests # MLLIB_GLM_REGRESSION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [ + # Optimization algorithm: sgd + OptionSet("optimizer", ["sgd"]), # The intercept for the data OptionSet("intercept", [0.0]), # The scale factor for label noise @@ -440,6 +442,8 @@ MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, MLLIB_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [ # Expected fraction of examples which are negative OptionSet("per-negative", [0.3]), + # Optimization algorithm: sgd, l-bfgs + OptionSet("optimizer", ["sgd", "l-bfgs"]) ] # GLM Classification Tests # @@ -466,15 +470,15 @@ if MLLIB_SPARK_VERSION >= 1.5: OptionSet("reg-param", [0.01]), # The scale factor for the noise in feature values OptionSet("feature-noise", [1.0]), - # The scale factor for the noise in label values - OptionSet("label-noise", [0.1]), - # The intercept for the data - OptionSet("intercept", [0.2]), # The step size is not used in LBFGS, but this is required in parameter checking. 
OptionSet("step-size", [0.0]) ] MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [ + # The scale factor for the noise in label values + OptionSet("label-noise", [0.1]), + # The intercept for the data + OptionSet("intercept", [0.2]), # Loss to minimize: l2 (squared error) OptionSet("loss", ["l2"]) ] @@ -488,9 +492,11 @@ if MLLIB_SPARK_VERSION >= 1.5: MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] + MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS + - [OptionSet("num-features", [100], can_scale=False)])] + [OptionSet("num-features", [200], can_scale=False)])] MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [ + # Expected fraction of examples which are negative + OptionSet("per-negative", [0.3]), # In GLM classification with elastic-net regularization, only logistic loss is supported. OptionSet("loss", ["logistic"]) ] @@ -504,7 +510,7 @@ if MLLIB_SPARK_VERSION >= 1.5: MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] + MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS + - [OptionSet("num-features", [100], can_scale=False)])] + [OptionSet("num-features", [200], can_scale=False)])] NAIVE_BAYES_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ # The number of features per example @@ -597,10 +603,10 @@ MLLIB_TESTS += [("als", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, # Clustering Tests # MLLIB_CLUSTERING_TEST_OPTS = MLLIB_COMMON_OPTS + [ - # The number of points - OptionSet("num-points", [1000000], can_scale=True), + # The number of examples + OptionSet("num-examples", [1000000], can_scale=True), # The number of features per point - OptionSet("num-columns", [10000], can_scale=False), + OptionSet("num-features", [10000], can_scale=False), # The number of centers OptionSet("num-centers", [20]), # The number of iterations for KMeans @@ -611,8 +617,8 @@ MLLIB_TESTS += [("kmeans", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, MLLIB_JAVA_OPTS, [ConstantOption("kmeans")] + MLLIB_CLUSTERING_TEST_OPTS)] MLLIB_GMM_TEST_OPTS = MLLIB_COMMON_OPTS + [ - OptionSet("num-points", [1000000], can_scale=True), - OptionSet("num-columns", [100], can_scale=False), + OptionSet("num-examples", [1000000], can_scale=True), + OptionSet("num-features", [100], can_scale=False), OptionSet("num-centers", [20], can_scale=False), OptionSet("num-iterations", [20])] @@ -632,16 +638,15 @@ if MLLIB_SPARK_VERSION >= 1.4: MLLIB_TESTS += [("lda", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, MLLIB_JAVA_OPTS, [ConstantOption("lda")] + MLLIB_LDA_TEST_OPTS)] -# TODO: tune PIC test size to run in 20-30 seconds MLLIB_PIC_TEST_OPTS = MLLIB_COMMON_OPTS + [ - OptionSet("num-points", [10000], can_scale=True), - OptionSet("node-degree", [10], can_scale=False), - OptionSet("num-centers", [20], can_scale=False), + OptionSet("num-examples", [10000000], can_scale=True), + OptionSet("node-degree", [20], can_scale=False), + OptionSet("num-centers", [40], can_scale=False), OptionSet("num-iterations", [20])] if MLLIB_SPARK_VERSION >= 1.3: MLLIB_TESTS += [("pic", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("pic")] + MLLIB_CLUSTERING_TEST_OPTS)] + MLLIB_JAVA_OPTS, [ConstantOption("pic")] + MLLIB_PIC_TEST_OPTS)] # Linear Algebra Tests # MLLIB_LINALG_TEST_OPTS = MLLIB_COMMON_OPTS + [ @@ -670,7 +675,7 @@ MLLIB_TESTS += [("pca", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, MLLIB_TESTS += [("summary-statistics", MLLIB_PERF_TEST_RUNNER, 
SCALE_FACTOR, MLLIB_JAVA_OPTS, [ConstantOption("summary-statistics")] + - MLLIB_LINALG_TEST_OPTS)] + MLLIB_BIG_LINALG_TEST_OPTS)] MLLIB_BLOCK_MATRIX_MULT_TEST_OPTS = MLLIB_COMMON_OPTS + [ OptionSet("m", [20000], can_scale=True), @@ -754,8 +759,8 @@ if MLLIB_SPARK_VERSION >= 1.3: MLLIB_PREFIX_SPAN_TEST_OPTS = MLLIB_FPM_TEST_OPTS + \ [OptionSet("num-sequences", [5000000], can_scale=True), OptionSet("avg-sequence-size", [5], can_scale=False), - OptionSet("avg-itemset-size", [1], can_scale=False), - OptionSet("num-items", [100], can_scale=False), + OptionSet("avg-itemset-size", [2], can_scale=False), + OptionSet("num-items", [500], can_scale=False), OptionSet("min-support", [0.5], can_scale=False), OptionSet("max-pattern-len", [10], can_scale=False), OptionSet("max-local-proj-db-size", [32000000], can_scale=False)] diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala index 6a51e93..63c4328 100644 --- a/mllib-tests/project/MLlibTestsBuild.scala +++ b/mllib-tests/project/MLlibTestsBuild.scala @@ -45,7 +45,7 @@ object MLlibTestsBuild extends Build { case v if v.startsWith("1.5.") => "v1p5" // acceptable for now, but change later when new algs are added case v if v.startsWith("1.6.") => "v1p5" case v if v.startsWith("2.0") => "v2p0" - case _ => throw new IllegalArgumentException(s"This Spark version isn't suppored: ${sparkVersion.value}.") + case _ => throw new IllegalArgumentException(s"This Spark version isn't supported: ${sparkVersion.value}.") } baseDirectory.value / targetFolder / "src" / "main" / "scala" }, From 44f7a5fe6c4b4990fd6d59009e5ec512f1897013 Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Fri, 1 Jul 2016 11:21:46 +0100 Subject: [PATCH 11/22] Back to 3.2.9 for json native --- mllib-tests/project/MLlibTestsBuild.scala | 2 +- spark-tests/project/SparkTestsBuild.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala index 63c4328..43d6b2a 100644 --- a/mllib-tests/project/MLlibTestsBuild.scala +++ b/mllib-tests/project/MLlibTestsBuild.scala @@ -22,7 +22,7 @@ object MLlibTestsBuild extends Build { "net.sf.jopt-simple" % "jopt-simple" % "4.6", "org.scalatest" %% "scalatest" % "2.2.1" % "test", "org.slf4j" % "slf4j-log4j12" % "1.7.2", - "org.json4s" %% "json4s-native" % "3.2.10" + "org.json4s" %% "json4s-native" % "3.2.9" // IMPORTANT! // We need to uncomment the below once Spark 2.0.0 becomes available diff --git a/spark-tests/project/SparkTestsBuild.scala b/spark-tests/project/SparkTestsBuild.scala index 707c39a..3aec419 100644 --- a/spark-tests/project/SparkTestsBuild.scala +++ b/spark-tests/project/SparkTestsBuild.scala @@ -15,7 +15,7 @@ object SparkTestsBuild extends Build { "net.sf.jopt-simple" % "jopt-simple" % "4.6", "org.scalatest" %% "scalatest" % "2.2.1" % "test", "com.google.guava" % "guava" % "14.0.1", - "org.json4s" %% "json4s-native" % "3.2.10" + "org.json4s" %% "json4s-native" % "3.2.9" // IMPORTANT! 
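
Note on the hunk above: the `scalaSource in Compile` override is what routes a single build definition to a per-Spark-version source tree (v1p4, v1p5, v2p0), so the perf suites can diverge where MLlib's API changed. A minimal standalone sketch of the same pattern (the project name here is assumed; the spark-perf builds declare an equivalent `sparkVersion` key):

    import sbt._
    import Keys._

    object VersionDispatchSketch extends Build {
      // Assumed key; mirrors the setting the spark-perf builds declare.
      val sparkVersion = settingKey[String]("Spark version to build against")

      lazy val root = Project("mllib-perf", file(".")).settings(
        sparkVersion := sys.props.getOrElse("spark.version", "2.0.0"),
        scalaSource in Compile := {
          // Pick the source tree matching the Spark line under test,
          // failing fast at project load for anything unsupported.
          val targetFolder = sparkVersion.value match {
            case v if v.startsWith("1.4.") => "v1p4"
            case v if v.startsWith("1.5.") || v.startsWith("1.6.") => "v1p5"
            case v if v.startsWith("2.0") => "v2p0"
            case v => sys.error(s"This Spark version isn't supported: $v")
          }
          baseDirectory.value / targetFolder / "src" / "main" / "scala"
        }
      )
    }
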
From 44f7a5fe6c4b4990fd6d59009e5ec512f1897013 Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Fri, 1 Jul 2016 11:21:46 +0100
Subject: [PATCH 11/22] Back to 3.2.9 for json native

---
 mllib-tests/project/MLlibTestsBuild.scala | 2 +-
 spark-tests/project/SparkTestsBuild.scala | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala
index 63c4328..43d6b2a 100644
--- a/mllib-tests/project/MLlibTestsBuild.scala
+++ b/mllib-tests/project/MLlibTestsBuild.scala
@@ -22,7 +22,7 @@ object MLlibTestsBuild extends Build {
       "net.sf.jopt-simple" % "jopt-simple" % "4.6",
       "org.scalatest" %% "scalatest" % "2.2.1" % "test",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
-      "org.json4s" %% "json4s-native" % "3.2.10"
+      "org.json4s" %% "json4s-native" % "3.2.9"
 
       // IMPORTANT!
       // We need to uncomment the below once Spark 2.0.0 becomes available
diff --git a/spark-tests/project/SparkTestsBuild.scala b/spark-tests/project/SparkTestsBuild.scala
index 707c39a..3aec419 100644
--- a/spark-tests/project/SparkTestsBuild.scala
+++ b/spark-tests/project/SparkTestsBuild.scala
@@ -15,7 +15,7 @@ object SparkTestsBuild extends Build {
       "net.sf.jopt-simple" % "jopt-simple" % "4.6",
       "org.scalatest" %% "scalatest" % "2.2.1" % "test",
       "com.google.guava" % "guava" % "14.0.1",
-      "org.json4s" %% "json4s-native" % "3.2.10"
+      "org.json4s" %% "json4s-native" % "3.2.9"
 
       // IMPORTANT!
       // We need to uncomment the below once Spark 2.0.0 becomes available

From cf6d6717754f806c06593c0fdf70fcef6334b21d Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Fri, 1 Jul 2016 11:23:06 +0100
Subject: [PATCH 12/22] Tidying up

---
 .../v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala
index 0d438db..4dd1d49 100644
--- a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala
+++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala
@@ -603,4 +603,3 @@ class DecisionTreeTest(sc: SparkContext) extends DecisionTreeTests(sc) {
     }
   }
 }
-

From 9e171d415e1099f4052055e55298e4f14a25b7cb Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Fri, 1 Jul 2016 16:36:33 +0100
Subject: [PATCH 13/22] Add default to 2.0 for spark mllib version

---
 config/config.py.template | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/config/config.py.template b/config/config.py.template
index 8489d6f..becd7b6 100755
--- a/config/config.py.template
+++ b/config/config.py.template
@@ -374,7 +374,8 @@ MLLIB_PERF_TEST_RUNNER = "mllib.perf.TestRunner"
 # * Build Spark locally by running `build/sbt assembly; build/sbt publishLocal` in the Spark root directory
 # * Set `USE_CLUSTER_SPARK = True` and `MLLIB_SPARK_VERSION = {desired Spark version, e.g. 1.5}`
 # * Don't use PREP_MLLIB_TESTS = True; instead manually run `cd mllib-tests; sbt/sbt -Dspark.version=1.5.0-SNAPSHOT clean assembly` to build perf tests
-MLLIB_SPARK_VERSION = 1.5
+
+MLLIB_SPARK_VERSION = 2.0
 
 MLLIB_JAVA_OPTS = COMMON_JAVA_OPTS
 if MLLIB_SPARK_VERSION >= 1.1:

From b35c29460f522e5cc4b7eef7c387da5ef5c78275 Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Mon, 15 Aug 2016 13:41:25 +0100
Subject: [PATCH 14/22] Use GA Spark 2 not preview

---
 mllib-tests/project/MLlibTestsBuild.scala         | 10 ++--------
 spark-tests/project/SparkTestsBuild.scala         | 10 ++--------
 streaming-tests/project/StreamingTestsBuild.scala |  4 ++--
 3 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala
index 43d6b2a..cf2641c 100644
--- a/mllib-tests/project/MLlibTestsBuild.scala
+++ b/mllib-tests/project/MLlibTestsBuild.scala
@@ -23,14 +23,8 @@ object MLlibTestsBuild extends Build {
       "org.scalatest" %% "scalatest" % "2.2.1" % "test",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
       "org.json4s" %% "json4s-native" % "3.2.9"
-
-      // IMPORTANT!
-      // We need to uncomment the below once Spark 2.0.0 becomes available
-      // This relies on using spark built under the lib folder
-      // of this project
-
-      //"org.apache.spark" %% "spark-core" % "2.0.0-SNAPSHOT" % "provided",
-      //"org.apache.spark" %% "spark-mllib" % "2.0.0-SNAPSHOT" % "provided"
+      "org.apache.spark" %% "spark-core" % "2.0.0",
+      "org.apache.spark" %% "spark-mllib" % "2.0.0"
     )
   )
 
diff --git a/spark-tests/project/SparkTestsBuild.scala b/spark-tests/project/SparkTestsBuild.scala
index 3aec419..8ea4915 100644
--- a/spark-tests/project/SparkTestsBuild.scala
+++ b/spark-tests/project/SparkTestsBuild.scala
@@ -15,14 +15,8 @@ object SparkTestsBuild extends Build {
       "net.sf.jopt-simple" % "jopt-simple" % "4.6",
       "org.scalatest" %% "scalatest" % "2.2.1" % "test",
       "com.google.guava" % "guava" % "14.0.1",
-      "org.json4s" %% "json4s-native" % "3.2.9"
-
-      // IMPORTANT!
-      // We need to uncomment the below once Spark 2.0.0 becomes available
-      // This relies on using spark built under the lib folder
-      // of this project
-      //"org.apache.spark" %% "spark-core" % "2.0.0-SNAPSHOT" % "provided",
-
+      "org.json4s" %% "json4s-native" % "3.2.9",
+      "org.apache.spark" %% "spark-core" % "2.0.0"
     ),
     test in assembly := {},
     outputPath in assembly := file("target/spark-perf-tests-assembly.jar"),
diff --git a/streaming-tests/project/StreamingTestsBuild.scala b/streaming-tests/project/StreamingTestsBuild.scala
index fc2569c..6bf6b25 100644
--- a/streaming-tests/project/StreamingTestsBuild.scala
+++ b/streaming-tests/project/StreamingTestsBuild.scala
@@ -20,8 +20,8 @@ object StreamingTestsBuild extends Build {
       "com.typesafe.akka" %% "akka-remote" % "2.3.11",
       "com.typesafe.akka" %% "akka-agent" % "2.3.11",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
-      "org.apache.spark" %% "spark-core" % "2.0.0-preview" % "provided",
-      "org.apache.spark" %% "spark-streaming" % "2.0.0-preview" % "provided"
+      "org.apache.spark" %% "spark-core" % "2.0.0",
+      "org.apache.spark" %% "spark-streaming" % "2.0.0"
     ),
     test in assembly := {},
     outputPath in assembly := file("target/streaming-perf-tests-assembly.jar"),

From 533f6a6e9e924a267e99214cfec17e314b9d31ab Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Mon, 15 Aug 2016 14:03:01 +0100
Subject: [PATCH 15/22] Ensure we can override Spark version for all projects

---
 mllib-tests/project/MLlibTestsBuild.scala         | 5 ++---
 spark-tests/project/SparkTestsBuild.scala         | 5 +++--
 streaming-tests/project/StreamingTestsBuild.scala | 7 ++++---
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala
index cf2641c..fa8b231 100644
--- a/mllib-tests/project/MLlibTestsBuild.scala
+++ b/mllib-tests/project/MLlibTestsBuild.scala
@@ -23,8 +23,8 @@ object MLlibTestsBuild extends Build {
       "org.scalatest" %% "scalatest" % "2.2.1" % "test",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
       "org.json4s" %% "json4s-native" % "3.2.9"
-      "org.apache.spark" %% "spark-core" % "2.0.0",
-      "org.apache.spark" %% "spark-mllib" % "2.0.0"
+      "org.apache.spark" %% "spark-core" % sparkVersion.value,
+      "org.apache.spark" %% "spark-mllib" % sparkVersion.value
     )
   )
 
@@ -33,7 +33,6 @@ object MLlibTestsBuild extends Build {
     file("."),
     settings = assemblySettings ++ commonSettings ++ Seq(
       scalaSource in Compile := {
-        println("sparkVersion.value is: " + sparkVersion.value)
         val targetFolder = sparkVersion.value match {
           case v if v.startsWith("1.4.") => "v1p4"
           case v if v.startsWith("1.5.") => "v1p5" // acceptable for now, but change later when new algs are added
diff --git a/spark-tests/project/SparkTestsBuild.scala b/spark-tests/project/SparkTestsBuild.scala
index 8ea4915..c842851 100644
--- a/spark-tests/project/SparkTestsBuild.scala
+++ b/spark-tests/project/SparkTestsBuild.scala
@@ -10,13 +10,14 @@ object SparkTestsBuild extends Build {
     settings = assemblySettings ++ Seq(
       organization := "org.spark-project",
       version := "0.1",
-      scalaVersion := "2.11.8",
+      scalaVersion := sys.props.getOrElse("scala.version", default="2.11.8"),
+      sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0"),
       libraryDependencies ++= Seq(
         "net.sf.jopt-simple" % "jopt-simple" % "4.6",
         "org.scalatest" %% "scalatest" % "2.2.1" % "test",
         "com.google.guava" % "guava" % "14.0.1",
         "org.json4s" %% "json4s-native" % "3.2.9",
-        "org.apache.spark" %% "spark-core" % "2.0.0"
+        "org.apache.spark" %% "spark-core" % sparkVersion
       ),
       test in assembly := {},
       outputPath in assembly := file("target/spark-perf-tests-assembly.jar"),
diff --git a/streaming-tests/project/StreamingTestsBuild.scala b/streaming-tests/project/StreamingTestsBuild.scala
index 6bf6b25..9c2e0bf 100644
--- a/streaming-tests/project/StreamingTestsBuild.scala
+++ b/streaming-tests/project/StreamingTestsBuild.scala
@@ -10,7 +10,8 @@ object StreamingTestsBuild extends Build {
     settings = assemblySettings ++ Seq(
       organization := "org.spark-project",
       version := "0.1",
-      scalaVersion := "2.11.8",
+      scalaVersion := sys.props.getOrElse("scala.version", default="2.11.8"),
+      sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0"),
       libraryDependencies ++= Seq(
         "net.sf.jopt-simple" % "jopt-simple" % "4.5",
         "org.scalatest" %% "scalatest" % "2.2.1" % "test",
@@ -20,8 +21,8 @@ object StreamingTestsBuild extends Build {
       "com.typesafe.akka" %% "akka-remote" % "2.3.11",
       "com.typesafe.akka" %% "akka-agent" % "2.3.11",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
-      "org.apache.spark" %% "spark-core" % "2.0.0",
-      "org.apache.spark" %% "spark-streaming" % "2.0.0"
+      "org.apache.spark" %% "spark-core" % sparkVersion,
+      "org.apache.spark" %% "spark-streaming" % sparkVersion
     ),
     test in assembly := {},
     outputPath in assembly := file("target/streaming-perf-tests-assembly.jar"),

From 1eb427c37b34302aa2770fe1ade2a9fb8f64b892 Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Mon, 15 Aug 2016 14:04:46 +0100
Subject: [PATCH 16/22] Add a missing comma for ml project file

---
 mllib-tests/project/MLlibTestsBuild.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala
index fa8b231..2ec5ecf 100644
--- a/mllib-tests/project/MLlibTestsBuild.scala
+++ b/mllib-tests/project/MLlibTestsBuild.scala
@@ -22,7 +22,7 @@ object MLlibTestsBuild extends Build {
       "net.sf.jopt-simple" % "jopt-simple" % "4.6",
       "org.scalatest" %% "scalatest" % "2.2.1" % "test",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
-      "org.json4s" %% "json4s-native" % "3.2.9"
+      "org.json4s" %% "json4s-native" % "3.2.9",
       "org.apache.spark" %% "spark-core" % sparkVersion.value,
       "org.apache.spark" %% "spark-mllib" % sparkVersion.value
     )

From 70963cfc9f2a6234d4db9f953e74b4b09bb8c89b Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Fri, 26 Aug 2016 13:26:45 +0100
Subject: [PATCH 17/22] Just use mllib provided artifact and remove core part

---
 mllib-tests/project/MLlibTestsBuild.scala | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala
index 2ec5ecf..4ebf2a1 100644
--- a/mllib-tests/project/MLlibTestsBuild.scala
+++ b/mllib-tests/project/MLlibTestsBuild.scala
@@ -23,8 +23,7 @@ object MLlibTestsBuild extends Build {
       "org.scalatest" %% "scalatest" % "2.2.1" % "test",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
       "org.json4s" %% "json4s-native" % "3.2.9",
-      "org.apache.spark" %% "spark-core" % sparkVersion.value,
-      "org.apache.spark" %% "spark-mllib" % sparkVersion.value
+      "org.apache.spark" %% "spark-mllib" % sparkVersion.value % "provided"
     )
   )
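
Note on PATCH 15 through PATCH 17: together they make the Spark and Scala versions overridable from the command line, e.g. `sbt/sbt -Dspark.version=2.0.0 -Dscala.version=2.11.8 clean assembly`. One sbt detail matters here: a `SettingKey[String]` is not itself a `String`, so it must be dereferenced with `.value` inside another setting; the bare `% sparkVersion` form in the PATCH 15 hunks for spark-tests and streaming-tests would not compile as written, which is presumably why the key is dropped from those projects again in PATCH 20 and PATCH 21. A sketch of the working pattern (object name assumed, mirroring the mllib-tests settings after PATCH 17):

    import sbt._
    import Keys._

    object OverridableVersionSketch {
      // Assumed key, equivalent to the one declared for mllib-tests.
      val sparkVersion = settingKey[String]("Spark version to build against")

      val exampleSettings = Seq(
        scalaVersion := sys.props.getOrElse("scala.version", "2.11.8"),
        sparkVersion := sys.props.getOrElse("spark.version", "2.0.0"),
        libraryDependencies ++= Seq(
          // Note the .value: the key itself cannot be used as a revision string.
          "org.apache.spark" %% "spark-mllib" % sparkVersion.value % "provided"
        )
      )
    }
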
From 1d1441ba259e1e8e64e868a921c83c28fec997da Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Fri, 26 Aug 2016 13:29:08 +0100
Subject: [PATCH 18/22] Use 2.0.0 to resolve artifacts for mllib

---
 config/config.py.template | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/config.py.template b/config/config.py.template
index becd7b6..c15d5e9 100755
--- a/config/config.py.template
+++ b/config/config.py.template
@@ -375,7 +375,7 @@ MLLIB_PERF_TEST_RUNNER = "mllib.perf.TestRunner"
 # * Set `USE_CLUSTER_SPARK = True` and `MLLIB_SPARK_VERSION = {desired Spark version, e.g. 1.5}`
 # * Don't use PREP_MLLIB_TESTS = True; instead manually run `cd mllib-tests; sbt/sbt -Dspark.version=1.5.0-SNAPSHOT clean assembly` to build perf tests
 
-MLLIB_SPARK_VERSION = 2.0
+MLLIB_SPARK_VERSION = 2.0.0
 
 MLLIB_JAVA_OPTS = COMMON_JAVA_OPTS
 if MLLIB_SPARK_VERSION >= 1.1:

From c873532c3ec91e814c78f1abcdf0c31702ebd99e Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Fri, 26 Aug 2016 14:41:08 +0100
Subject: [PATCH 19/22] Remove feature-noise from glm regression so we can build and run

---
 .../src/main/scala/mllib/perf/MLAlgorithmTests.scala | 12 ++++--------
 .../v2p0/src/main/scala/mllib/perf/StatTests.scala   |  2 +-
 .../main/scala/mllib/perf/util/DataGenerator.scala   |  8 +++-----
 3 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/MLAlgorithmTests.scala
index 693ca7c..37409b7 100644
--- a/mllib-tests/v2p0/src/main/scala/mllib/perf/MLAlgorithmTests.scala
+++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/MLAlgorithmTests.scala
@@ -204,10 +204,9 @@ class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) {
 
 class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) {
   val THRESHOLD = ("per-negative", "probability for a negative label during data generation")
-  val FEATURE_NOISE = ("feature-noise", "scale factor for the noise during feature generation")
   val LOSS = ("loss", "loss to minimize. Supported: logistic, hinge (SVM).")
 
-  doubleOptions = doubleOptions ++ Seq(THRESHOLD, FEATURE_NOISE)
+  doubleOptions = doubleOptions ++ Seq(THRESHOLD)
   stringOptions = stringOptions ++ Seq(LOSS)
 
   val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions
@@ -227,10 +226,9 @@ class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) {
     val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
 
     val threshold: Double = doubleOptionValue(THRESHOLD)
-    val featureNoise: Double = doubleOptionValue(FEATURE_NOISE)
 
     val data = DataGenerator.generateClassificationLabeledPoints(sc,
-      math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, featureNoise, numPartitions,
+      math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, numPartitions,
       seed)
 
     val split = data.randomSplit(Array(0.8, 0.2), seed)
@@ -462,11 +460,10 @@ class NaiveBayesTest(sc: SparkContext)
   extends RegressionAndClassificationTests[NaiveBayesModel](sc) {
 
   val THRESHOLD = ("per-negative", "probability for a negative label during data generation")
-  val FEATURE_NOISE = ("feature-noise", "scale factor for the noise during feature generation")
   val SMOOTHING = ("nb-lambda", "the smoothing parameter lambda for Naive Bayes")
   val MODEL_TYPE = ("model-type", "either multinomial (default) or bernoulli")
 
-  doubleOptions = doubleOptions ++ Seq(THRESHOLD, FEATURE_NOISE, SMOOTHING)
+  doubleOptions = doubleOptions ++ Seq(THRESHOLD, SMOOTHING)
   stringOptions = stringOptions ++ Seq(MODEL_TYPE)
   val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions
   addOptionsToParser()
@@ -478,7 +475,6 @@ class NaiveBayesTest(sc: SparkContext)
     val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
 
     val threshold: Double = doubleOptionValue(THRESHOLD)
-    val featureNoise: Double = doubleOptionValue(FEATURE_NOISE)
     val modelType = stringOptionValue(MODEL_TYPE)
 
     val data = if (modelType == "bernoulli") {
@@ -486,7 +482,7 @@ class NaiveBayesTest(sc: SparkContext)
         math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, numPartitions, seed)
     } else {
       val negdata = DataGenerator.generateClassificationLabeledPoints(sc,
-        math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, featureNoise, numPartitions,
+        math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, numPartitions,
         seed)
       val dataNonneg = negdata.map { lp =>
         LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map(math.abs)))
diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/StatTests.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/StatTests.scala
index 21c286c..2e84629 100644
--- a/mllib-tests/v2p0/src/main/scala/mllib/perf/StatTests.scala
+++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/StatTests.scala
@@ -73,7 +73,7 @@ class ChiSquaredFeatureTest(sc: SparkContext) extends StatTests[RDD[LabeledPoint
     val n: Int = intOptionValue(NUM_COLS)
     val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
 
-    rdd = DataGenerator.generateClassificationLabeledPoints(sc, m, n, 0.5, 1.0, numPartitions,
+    rdd = DataGenerator.generateClassificationLabeledPoints(sc, m, n, 0.5, numPartitions,
       seed, chiSq = true).cache()
 
     // Materialize rdd
diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataGenerator.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataGenerator.scala
index e65a5a5..ff3fd00 100644
--- a/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataGenerator.scala
+++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataGenerator.scala
@@ -49,12 +49,11 @@ object DataGenerator {
       numRows: Long,
       numCols: Int,
       threshold: Double,
-      featureNoise: Double,
       numPartitions: Int,
      seed: Long = System.currentTimeMillis(),
       chiSq: Boolean = false): RDD[LabeledPoint] = {
 
-    RandomRDDs.randomRDD(sc, new ClassLabelGenerator(numCols,threshold, featureNoise, chiSq),
+    RandomRDDs.randomRDD(sc, new ClassLabelGenerator(numCols,threshold, chiSq),
       numRows, numPartitions, seed)
   }
 
@@ -364,7 +363,6 @@ class RatingGenerator(
 class ClassLabelGenerator(
     private val numFeatures: Int,
     private val threshold: Double,
-    private val featureNoise: Double,
     private val chiSq: Boolean) extends RandomDataGenerator[LabeledPoint] {
 
   private val rng = new java.util.Random()
 
@@ -372,7 +370,7 @@ class ClassLabelGenerator(
   override def nextValue(): LabeledPoint = {
     val y = if (rng.nextDouble() < threshold) 0.0 else 1.0
     val x = Array.fill[Double](numFeatures) {
-      if (!chiSq) rng.nextGaussian() + (y * featureNoise) else rng.nextInt(6) * 1.0
+      if (!chiSq) rng.nextGaussian() + y else rng.nextInt(6) * 1.0
     }
 
     LabeledPoint(y, Vectors.dense(x))
@@ -383,7 +381,7 @@ class ClassLabelGenerator(
   }
 
   override def copy(): ClassLabelGenerator =
-    new ClassLabelGenerator(numFeatures, threshold, featureNoise, chiSq)
+    new ClassLabelGenerator(numFeatures, threshold, chiSq)
 }
 
 class BinaryLabeledDataGenerator(
From e15a40e565568129c25b188328dc4ea1bd5b9d53 Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Fri, 26 Aug 2016 15:00:24 +0100
Subject: [PATCH 20/22] Remove sparkVersion statement in SparkTestsBuild.scala

---
 spark-tests/project/SparkTestsBuild.scala | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/spark-tests/project/SparkTestsBuild.scala b/spark-tests/project/SparkTestsBuild.scala
index c842851..f77c39d 100644
--- a/spark-tests/project/SparkTestsBuild.scala
+++ b/spark-tests/project/SparkTestsBuild.scala
@@ -11,13 +11,12 @@ object SparkTestsBuild extends Build {
       organization := "org.spark-project",
       version := "0.1",
       scalaVersion := sys.props.getOrElse("scala.version", default="2.11.8"),
-      sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0"),
       libraryDependencies ++= Seq(
         "net.sf.jopt-simple" % "jopt-simple" % "4.6",
         "org.scalatest" %% "scalatest" % "2.2.1" % "test",
         "com.google.guava" % "guava" % "14.0.1",
-        "org.json4s" %% "json4s-native" % "3.2.9",
-        "org.apache.spark" %% "spark-core" % sparkVersion
+        "org.apache.spark" %% "spark-core" % "2.0.0" % "provided",
+        "org.json4s" %% "json4s-native" % "3.2.9"
       ),
       test in assembly := {},
       outputPath in assembly := file("target/spark-perf-tests-assembly.jar"),

From 0ad07c7dc2c58b01688f08f0985fed88075eaf49 Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Fri, 26 Aug 2016 16:36:04 +0100
Subject: [PATCH 21/22] Remove spark version from streaming project file also

---
 streaming-tests/project/StreamingTestsBuild.scala | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/streaming-tests/project/StreamingTestsBuild.scala b/streaming-tests/project/StreamingTestsBuild.scala
index 9c2e0bf..39784f3 100644
--- a/streaming-tests/project/StreamingTestsBuild.scala
+++ b/streaming-tests/project/StreamingTestsBuild.scala
@@ -11,7 +11,6 @@ object StreamingTestsBuild extends Build {
       organization := "org.spark-project",
       version := "0.1",
       scalaVersion := sys.props.getOrElse("scala.version", default="2.11.8"),
-      sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0"),
       libraryDependencies ++= Seq(
         "net.sf.jopt-simple" % "jopt-simple" % "4.5",
         "org.scalatest" %% "scalatest" % "2.2.1" % "test",
@@ -21,8 +20,8 @@ object StreamingTestsBuild extends Build {
       "com.typesafe.akka" %% "akka-remote" % "2.3.11",
       "com.typesafe.akka" %% "akka-agent" % "2.3.11",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
-      "org.apache.spark" %% "spark-core" % sparkVersion,
-      "org.apache.spark" %% "spark-streaming" % sparkVersion
+      "org.apache.spark" %% "spark-core" % "2.0.0" % "provided",
+      "org.apache.spark" %% "spark-streaming" % "2.0.0" % "provided"
     ),
     test in assembly := {},
     outputPath in assembly := file("target/streaming-perf-tests-assembly.jar"),

From e6cbaf9d266eca7e6e11a71f27c3655f412f02da Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Tue, 27 Sep 2016 19:21:30 +0100
Subject: [PATCH 22/22] Lower defaults, feature-noise if Spark 1.x only

---
 config/config.py.template                 | 21 +++++++++++++++------
 mllib-tests/project/MLlibTestsBuild.scala |  5 ++++-
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/config/config.py.template b/config/config.py.template
index c15d5e9..aeb39ee 100755
--- a/config/config.py.template
+++ b/config/config.py.template
@@ -128,7 +128,7 @@ PYTHON_MLLIB_OUTPUT_FILENAME = "results/python_mllib_perf_output_%s_%s" % (
 # number of records in a generated dataset) if you are running the tests with more
 # or fewer nodes. When developing new test suites, you might want to set this to a small
 # value suitable for a single machine, such as 0.001.
-SCALE_FACTOR = 1.0
+SCALE_FACTOR = 0.01
 
 assert SCALE_FACTOR > 0, "SCALE_FACTOR must be > 0."
 
@@ -155,7 +155,8 @@ COMMON_JAVA_OPTS = [
     JavaOptionSet("spark.locality.wait", [str(60 * 1000 * 1000)])
 ]
 # Set driver memory here
-SPARK_DRIVER_MEMORY = "20g"
+SPARK_DRIVER_MEMORY = "1g"
+
 # The following options value sets are shared among all tests.
 COMMON_OPTS = [
     # How many times to run each experiment - used to warm up system caches.
@@ -375,7 +376,7 @@ MLLIB_PERF_TEST_RUNNER = "mllib.perf.TestRunner"
 # * Set `USE_CLUSTER_SPARK = True` and `MLLIB_SPARK_VERSION = {desired Spark version, e.g. 1.5}`
 # * Don't use PREP_MLLIB_TESTS = True; instead manually run `cd mllib-tests; sbt/sbt -Dspark.version=1.5.0-SNAPSHOT clean assembly` to build perf tests
 
-MLLIB_SPARK_VERSION = 2.0.0
+MLLIB_SPARK_VERSION = 2.0
 
 MLLIB_JAVA_OPTS = COMMON_JAVA_OPTS
 if MLLIB_SPARK_VERSION >= 1.1:
@@ -403,10 +404,8 @@ MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS = MLLIB_COMMON_OPTS + [
 ]
 
 # Generalized Linear Model (GLM) Tests #
+
 MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
-    # The scale factor for the noise in feature values.
-    # Currently ignored for regression.
-    OptionSet("feature-noise", [1.0]),
     # The number of features per example
     OptionSet("num-features", [10000], can_scale=False),
     # The number of iterations for SGD
@@ -418,12 +417,22 @@ MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
     # Regularization parameter
     OptionSet("reg-param", [0.1])
 ]
+
 if MLLIB_SPARK_VERSION >= 1.5:
     MLLIB_GLM_TEST_OPTS += [
         # Ignored, but required for config
         OptionSet("elastic-net-param", [0.0])
     ]
 
+if MLLIB_SPARK_VERSION < 2.0:
+    MLLIB_GLM_TEST_OPTS += [
+        # The scale factor for the noise in feature values.
+        # Currently ignored for regression.
+        # Only available in Spark 1.x
+        OptionSet("feature-noise", [1.0])
+    ]
+
+
 # GLM Regression Tests #
 MLLIB_GLM_REGRESSION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [
     # Optimization algorithm: sgd
diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala
index 4ebf2a1..3622734 100644
--- a/mllib-tests/project/MLlibTestsBuild.scala
+++ b/mllib-tests/project/MLlibTestsBuild.scala
@@ -23,7 +23,10 @@ object MLlibTestsBuild extends Build {
       "org.scalatest" %% "scalatest" % "2.2.1" % "test",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
       "org.json4s" %% "json4s-native" % "3.2.9",
-      "org.apache.spark" %% "spark-mllib" % sparkVersion.value % "provided"
+      // Allow the user to set the Spark version but default to look
+      // for the Spark 2.0.0 artifact. Uncomment below to use spark.version
+      // "org.apache.spark" %% "spark-mllib" % sparkVersion.value % "provided"
+      "org.apache.spark" %% "spark-mllib" % "2.0.0" % "provided"
    )
  )
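
Note on PATCH 22: besides lowering the defaults and gating `feature-noise` to the 1.x lines, it moves `MLLIB_SPARK_VERSION` back from `2.0.0` to `2.0`. The config template compares this value numerically (`>= 1.1`, `>= 1.5`, `< 2.0`), so it has to remain a two-part number — `2.0.0` is not even a valid Python numeric literal — while the full three-part artifact version lives only in the sbt build. A sketch of that split, with a hypothetical helper object, in Scala:

    // Sketch: keep the full artifact string for dependency resolution and
    // derive a numeric major.minor gate the way config.py.template uses it.
    object VersionGateSketch {
      val artifactVersion: String = sys.props.getOrElse("spark.version", "2.0.0")

      // "2.0.0" -> 2.0 (assumes at least a major.minor version string)
      val featureGate: Double = {
        val Array(major, minor) = artifactVersion.split("\\.").take(2)
        s"$major.$minor".toDouble
      }

      def main(args: Array[String]): Unit = {
        // e.g. the GLM feature-noise option is only added on the 1.x lines.
        val hasFeatureNoise = featureGate < 2.0
        println(s"artifact=$artifactVersion gate=$featureGate feature-noise=$hasFeatureNoise")
      }
    }
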