From 5f090fc2f1c272b839cee8965c77293d018c18d1 Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Mon, 20 Jun 2016 14:30:30 +0100 Subject: [PATCH 01/22] Spark 2.0.0 support This commit makes the default version of Spark "2.0.0-preview" and consists of various configuration file changes and a couple of method changes. We should remove the -preview from the project files once 2.0.0 is made generally available (so we won't be relying on the preview builds). * Akka dependencies are now downloaded for streaming-tests * Scala 2.11.8 is used * config.py now looks for $SPARK_HOME instead of /root * foreachRDD is used instead of foreach for a DStream * awaitTerminationOrTimeout is used instead of awaitTermination for a StreamingContext (both streaming migrations are sketched after the spark-tests diffs below) * json4s render call is removed owing to API changes --- config/config.py.template | 5 ++++- lib/sparkperf/testsuites.py | 2 +- mllib-tests/project/MLlibTestsBuild.scala | 14 +++++++------- pyspark-tests/mllib_tests.py | 8 +++++--- spark-tests/project/SparkTestsBuild.scala | 8 ++++---- .../src/main/scala/spark/perf/TestRunner.scala | 2 +- streaming-tests/project/StreamingTestsBuild.scala | 10 +++++++--- .../scala/streaming/perf/HdfsRecoveryTest.scala | 4 ++-- .../src/main/scala/streaming/perf/KVDataTest.scala | 2 +- 9 files changed, 32 insertions(+), 23 deletions(-) diff --git a/config/config.py.template b/config/config.py.template index a348b6f..bde48d2 100755 --- a/config/config.py.template +++ b/config/config.py.template @@ -18,7 +18,8 @@ from sparkperf.config_utils import FlagSet, JavaOptionSet, OptionSet, ConstantOp # ================================ # # Point to an installation of Spark on the cluster. -SPARK_HOME_DIR = "/root/spark" +DEFAULT_HOME=os.environ['HOME'] +SPARK_HOME_DIR = os.getenv('SPARK_HOME', DEFAULT_HOME) # Use a custom configuration directory SPARK_CONF_DIR = SPARK_HOME_DIR + "/conf" @@ -370,6 +371,8 @@ MLLIB_PERF_TEST_RUNNER = "mllib.perf.TestRunner" # * Build Spark locally by running `build/sbt assembly; build/sbt publishLocal` in the Spark root directory # * Set `USE_CLUSTER_SPARK = True` and `MLLIB_SPARK_VERSION = {desired Spark version, e.g. 
1.5}` # * Don't use PREP_MLLIB_TESTS = True; instead manually run `cd mllib-tests; sbt/sbt -Dspark.version=1.5.0-SNAPSHOT clean assembly` to build perf tests + +# Can be changed to 2.0 to run against Spark 2.0 MLLIB_SPARK_VERSION = 1.5 MLLIB_JAVA_OPTS = COMMON_JAVA_OPTS diff --git a/lib/sparkperf/testsuites.py b/lib/sparkperf/testsuites.py index 573ecf5..46ab349 100644 --- a/lib/sparkperf/testsuites.py +++ b/lib/sparkperf/testsuites.py @@ -252,7 +252,7 @@ class MLlibTests(JVMPerfTestSuite, MLlibTestHelper): @classmethod def build(cls, spark_version): - run_cmd("cd %s/mllib-tests; %s -Dspark.version=%s.0 clean assembly" % (PROJ_DIR, SBT_CMD, spark_version)) + run_cmd("cd %s/mllib-tests; %s -Dspark.version=%s clean assembly" % (PROJ_DIR, SBT_CMD, spark_version)) @classmethod def process_output(cls, config, short_name, opt_list, stdout_filename, stderr_filename): diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala index 3347db4..64a46e9 100644 --- a/mllib-tests/project/MLlibTestsBuild.scala +++ b/mllib-tests/project/MLlibTestsBuild.scala @@ -16,14 +16,15 @@ object MLlibTestsBuild extends Build { lazy val commonSettings = Seq( organization := "org.spark-project", version := "0.1", - scalaVersion := "2.10.4", - sparkVersion := sys.props.getOrElse("spark.version", default="1.5.2"), + scalaVersion := "2.11.8", + sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0-preview"), libraryDependencies ++= Seq( "net.sf.jopt-simple" % "jopt-simple" % "4.6", "org.scalatest" %% "scalatest" % "2.2.1" % "test", "org.slf4j" % "slf4j-log4j12" % "1.7.2", - "org.json4s" %% "json4s-native" % "3.2.9", - "org.apache.spark" %% "spark-mllib" % sparkVersion.value % "provided" + "org.json4s" %% "json4s-native" % "3.2.10", + "org.apache.spark" %% "spark-core" % "2.0.0-preview" % "provided", + "org.apache.spark" %% "spark-streaming" % "2.0.0-preview" % "provided" ) ) @@ -35,9 +36,8 @@ object MLlibTestsBuild extends Build { val targetFolder = sparkVersion.value match { case v if v.startsWith("1.4.") => "v1p4" case v if v.startsWith("1.5.") => "v1p5" - case v if v.startsWith("1.6.") => - "v1p5" // acceptable for now, but change later when new algs are added - case _ => throw new IllegalArgumentException(s"Do not support Spark ${sparkVersion.value}.") + case v if v.startsWith("2.0") => "v2p0" + case _ => throw new IllegalArgumentException(s"This Spark version isn't supported: ${sparkVersion.value}.") } baseDirectory.value / targetFolder / "src" / "main" / "scala" }, diff --git a/pyspark-tests/mllib_tests.py b/pyspark-tests/mllib_tests.py index 1b6a306..133d751 100644 --- a/pyspark-tests/mllib_tests.py +++ b/pyspark-tests/mllib_tests.py @@ -219,8 +219,8 @@ def __init__(self, sc): def createInputData(self): options = self.options - numTrain = options.num_examples - numTest = int(options.num_examples * 0.2) + numTrain = options.num_points + numTest = int(options.num_points * 0.2) self.trainRDD = LabeledDataGenerator.generateGLMData( self.sc, numTrain, options.num_features, options.num_partitions, options.random_seed, labelType=2) @@ -242,7 +242,7 @@ def __init__(self, sc): def createInputData(self): options = self.options self.data = FeaturesGenerator.generateContinuousData( - self.sc, options.num_examples, options.num_features, + self.sc, options.num_points, options.num_columns, options.num_partitions, options.random_seed) def runTest(self): @@ -368,6 +368,8 @@ def runTest(self): parser.add_option("--num-ratings", type="int", default=500) 
parser.add_option("--implicit-prefs", type="int", default=0) # MLLIB_CLUSTERING_TEST_OPTS + parser.add_option("--num-points", type="int", default=1000) + parser.add_option("--num-columns", type="int", default=10) parser.add_option("--num-centers", type="int", default=5) # MLLIB_LINALG_TEST_OPTS + MLLIB_STATS_TEST_OPTS parser.add_option("--num-rows", type="int", default=1000) diff --git a/spark-tests/project/SparkTestsBuild.scala b/spark-tests/project/SparkTestsBuild.scala index 4116326..697b28a 100644 --- a/spark-tests/project/SparkTestsBuild.scala +++ b/spark-tests/project/SparkTestsBuild.scala @@ -10,13 +10,13 @@ object SparkTestsBuild extends Build { settings = assemblySettings ++ Seq( organization := "org.spark-project", version := "0.1", - scalaVersion := "2.10.4", + scalaVersion := "2.11.8", libraryDependencies ++= Seq( "net.sf.jopt-simple" % "jopt-simple" % "4.6", "org.scalatest" %% "scalatest" % "2.2.1" % "test", "com.google.guava" % "guava" % "14.0.1", - "org.apache.spark" %% "spark-core" % "1.0.0" % "provided", - "org.json4s" %% "json4s-native" % "3.2.9" + "org.apache.spark" %% "spark-core" % "2.0.0-preview" % "provided", + "org.json4s" %% "json4s-native" % "3.2.10" ), test in assembly := {}, outputPath in assembly := file("target/spark-perf-tests-assembly.jar"), @@ -36,4 +36,4 @@ object SparkTestsBuild extends Build { case _ => MergeStrategy.first } )) -} \ No newline at end of file +} diff --git a/spark-tests/src/main/scala/spark/perf/TestRunner.scala b/spark-tests/src/main/scala/spark/perf/TestRunner.scala index cbfcb0a..6c21f33 100644 --- a/spark-tests/src/main/scala/spark/perf/TestRunner.scala +++ b/spark-tests/src/main/scala/spark/perf/TestRunner.scala @@ -44,7 +44,7 @@ object TestRunner { ("sparkVersion" -> sc.version) ~ ("systemProperties" -> System.getProperties.asScala.toMap) ~ ("results" -> results) - println("results: " + compact(render(json))) + println("results: " + compact(json)) // Gracefully stop the SparkContext so that the application web UI can be preserved // and viewed using the HistoryServer. 
diff --git a/streaming-tests/project/StreamingTestsBuild.scala b/streaming-tests/project/StreamingTestsBuild.scala index 7c8c903..fc2569c 100644 --- a/streaming-tests/project/StreamingTestsBuild.scala +++ b/streaming-tests/project/StreamingTestsBuild.scala @@ -10,14 +10,18 @@ object StreamingTestsBuild extends Build { settings = assemblySettings ++ Seq( organization := "org.spark-project", version := "0.1", - scalaVersion := "2.10.4", + scalaVersion := "2.11.8", libraryDependencies ++= Seq( "net.sf.jopt-simple" % "jopt-simple" % "4.5", "org.scalatest" %% "scalatest" % "2.2.1" % "test", "com.google.guava" % "guava" % "14.0.1", + "com.typesafe.akka" %% "akka-actor" % "2.3.11", + "com.typesafe.akka" %% "akka-slf4j" % "2.3.11", + "com.typesafe.akka" %% "akka-remote" % "2.3.11", + "com.typesafe.akka" %% "akka-agent" % "2.3.11", "org.slf4j" % "slf4j-log4j12" % "1.7.2", - "org.apache.spark" %% "spark-core" % "1.0.0" % "provided", - "org.apache.spark" %% "spark-streaming" % "1.0.0" % "provided" + "org.apache.spark" %% "spark-core" % "2.0.0-preview" % "provided", + "org.apache.spark" %% "spark-streaming" % "2.0.0-preview" % "provided" ), test in assembly := {}, outputPath in assembly := file("target/streaming-perf-tests-assembly.jar"), diff --git a/streaming-tests/src/main/scala/streaming/perf/HdfsRecoveryTest.scala b/streaming-tests/src/main/scala/streaming/perf/HdfsRecoveryTest.scala index 23e8b80..841ae5e 100644 --- a/streaming-tests/src/main/scala/streaming/perf/HdfsRecoveryTest.scala +++ b/streaming-tests/src/main/scala/streaming/perf/HdfsRecoveryTest.scala @@ -39,7 +39,7 @@ class HdfsRecoveryTest extends PerfTest { // Verify the running counts. For any key the running count should be in the sequence // 1, 3, 6, 10, 15, 21, ... (i.e., nth number is sum of 1..n) val expectedCounts = (1L to maxRecordsPerFile).map(x => (1L to x).reduce(_ + _)).toSet - wordStream.foreach((rdd: RDD[(String, Long)], time: Time) => { + wordStream.foreachRDD((rdd: RDD[(String, Long)], time: Time) => { val partitionCounts = rdd.sparkContext.runJob(rdd.mapPartitions(iter => iter.toSeq.groupBy(_._1).toSeq.map(x => (x._1, x._2.map(_._2).sum)).toIterator ), (iter: Iterator[(String, Long)]) => iter.toArray) @@ -48,7 +48,7 @@ class HdfsRecoveryTest extends PerfTest { val counts = rdd.reduceByKey(_ + _, 1).collect() println(s"New total count at $time = " + counts.mkString("[", ", ", "]")) }) - runningCountStream.foreach((rdd: RDD[(String, Long)], time: Time) => { + runningCountStream.foreachRDD((rdd: RDD[(String, Long)], time: Time) => { val counts = rdd.collect() val possibleCounts = expectedCounts val expected = counts.forall { case (word, count) => possibleCounts.contains(count) } diff --git a/streaming-tests/src/main/scala/streaming/perf/KVDataTest.scala b/streaming-tests/src/main/scala/streaming/perf/KVDataTest.scala index 628376d..9aeb989 100644 --- a/streaming-tests/src/main/scala/streaming/perf/KVDataTest.scala +++ b/streaming-tests/src/main/scala/streaming/perf/KVDataTest.scala @@ -70,7 +70,7 @@ abstract class KVDataTest extends PerfTest { // run test ssc.start() val startTime = System.currentTimeMillis - ssc.awaitTermination(totalDurationSec * 1000) + ssc.awaitTerminationOrTimeout(totalDurationSec * 1000) ssc.stop() processResults(statsReportListener) } From a6c3403c15cc0db35d80186938edb73749490abc Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Mon, 20 Jun 2016 18:21:24 +0100 Subject: [PATCH 02/22] Fix dependency for mllib --- mllib-tests/project/MLlibTestsBuild.scala | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala index 64a46e9..6799a93 100644 --- a/mllib-tests/project/MLlibTestsBuild.scala +++ b/mllib-tests/project/MLlibTestsBuild.scala @@ -24,7 +24,7 @@ object MLlibTestsBuild extends Build { "org.slf4j" % "slf4j-log4j12" % "1.7.2", "org.json4s" %% "json4s-native" % "3.2.10", "org.apache.spark" %% "spark-core" % "2.0.0-preview" % "provided", - "org.apache.spark" %% "spark-streaming" % "2.0.0-preview" % "provided" + "org.apache.spark" %% "spark-mllib" % "2.0.0-preview" % "provided" ) ) From 27e640f3e11ed5d2b978f6b781f1db09ed04c99e Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Mon, 20 Jun 2016 18:24:01 +0100 Subject: [PATCH 03/22] Add 1.6 target folder back in --- mllib-tests/project/MLlibTestsBuild.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala index 6799a93..cd392d8 100644 --- a/mllib-tests/project/MLlibTestsBuild.scala +++ b/mllib-tests/project/MLlibTestsBuild.scala @@ -36,6 +36,8 @@ object MLlibTestsBuild extends Build { val targetFolder = sparkVersion.value match { case v if v.startsWith("1.4.") => "v1p4" case v if v.startsWith("1.5.") => "v1p5" + case v if v.startsWith("1.6.") => + "v1p5" // acceptable for now, but change later when new algs are added case v if v.startsWith("2.0") => "v2p0" case _ => throw new IllegalArgumentException(s"This Spark version isn't supported: ${sparkVersion.value}.") } From 54735062c85455e4925da883e22c33d72158f0ad Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Thu, 30 Jun 2016 16:47:21 +0100 Subject: [PATCH 04/22] Add ML changes for Spark 2; note this currently requires Spark in the lib folder! --- config/config.py.template | 801 ------------------ mllib-tests/project/MLlibTestsBuild.scala | 20 +- .../scala/mllib/perf/MLAlgorithmTests.scala | 21 +- .../perf/clustering/GaussianMixtureTest.scala | 14 +- .../scala/mllib/perf/clustering/PICTest.scala | 13 +- .../scala/mllib/perf/util/DataGenerator.scala | 8 +- .../scala/mllib/perf/MLAlgorithmTests.scala | 37 +- .../perf/clustering/GaussianMixtureTest.scala | 14 +- .../scala/mllib/perf/clustering/PICTest.scala | 13 +- .../scala/mllib/perf/util/DataGenerator.scala | 8 +- .../scala/mllib/perf/LinearAlgebraTests.scala | 68 ++ .../scala/mllib/perf/MLAlgorithmTests.scala | 779 +++++++++++++++++ .../src/main/scala/mllib/perf/PerfTest.scala | 134 +++ .../src/main/scala/mllib/perf/StatTests.scala | 109 +++ .../main/scala/mllib/perf/TestRunner.scala | 87 ++ .../perf/clustering/GaussianMixtureTest.scala | 63 ++ .../scala/mllib/perf/clustering/LDATest.scala | 73 ++ .../scala/mllib/perf/clustering/PICTest.scala | 53 ++ .../mllib/perf/feature/Word2VecTest.scala | 69 ++ .../scala/mllib/perf/fpm/FPGrowthTest.scala | 65 ++ .../scala/mllib/perf/fpm/PrefixSpanTest.scala | 82 ++ .../perf/linalg/BlockMatrixMultTest.scala | 74 ++ .../scala/mllib/perf/util/DataGenerator.scala | 586 +++++++++++++ .../scala/mllib/perf/util/DataLoader.scala | 143 ++++ 24 files changed, 2457 insertions(+), 877 deletions(-) delete mode 100755 config/config.py.template create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/LinearAlgebraTests.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/MLAlgorithmTests.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/PerfTest.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/StatTests.scala create mode 100644 
mllib-tests/v2p0/src/main/scala/mllib/perf/TestRunner.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/LDATest.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/PICTest.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/feature/Word2VecTest.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/fpm/FPGrowthTest.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/fpm/PrefixSpanTest.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/linalg/BlockMatrixMultTest.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataGenerator.scala create mode 100644 mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataLoader.scala diff --git a/config/config.py.template b/config/config.py.template deleted file mode 100755 index bde48d2..0000000 --- a/config/config.py.template +++ /dev/null @@ -1,801 +0,0 @@ -""" -Configuration options for running Spark performance tests. - -When updating `spark-perf`, you should probably use `diff` to compare the updated template to -your modified `config.py` file and copy over any new configurations. -""" - -import time -import os -import os.path -import socket - -from sparkperf.config_utils import FlagSet, JavaOptionSet, OptionSet, ConstantOption - - -# ================================ # -# Standard Configuration Options # -# ================================ # - -# Point to an installation of Spark on the cluster. -DEFAULT_HOME=os.environ['HOME'] -SPARK_HOME_DIR = os.getenv('SPARK_HOME', DEFAULT_HOME) - -# Use a custom configuration directory -SPARK_CONF_DIR = SPARK_HOME_DIR + "/conf" - -# Master used when submitting Spark jobs. -# For local clusters: "spark://%s:7077" % socket.gethostname() -# For Yarn clusters: "yarn" -# Otherwise, the default uses the specified EC2 cluster -SPARK_CLUSTER_URL = open("/root/spark-ec2/cluster-url", 'r').readline().strip() -IS_YARN_MODE = "yarn" in SPARK_CLUSTER_URL -IS_MESOS_MODE = "mesos" in SPARK_CLUSTER_URL - -# Specify URI to download spark executor. This only applied for running with Mesos. -#SPARK_EXECUTOR_URI = "http://localhost:8000/spark.tgz" - -# Path to the Mesos native library. This is only required for running with Mesos. -#MESOS_NATIVE_LIBRARY = "/usr/local/lib/libmesos.so" - -# Run Mesos client in coarse or fine grain mode. This is only applied for running with Mesos. -#SPARK_MESOS_COARSE = True - - -# If this is true, we'll submit your job using an existing Spark installation. -# If this is false, we'll clone and build a specific version of Spark, and -# copy configurations from your existing Spark installation. -USE_CLUSTER_SPARK = True - -# URL of the HDFS installation in the Spark EC2 cluster -HDFS_URL = "hdfs://%s:9000/test/" % socket.gethostname() - -# Set the following if not using existing Spark installation -# Commit id and repo used if you are not using an existing Spark cluster -# custom version of Spark. The remote name in your git repo is assumed -# to be "origin". -# -# The commit ID can specify any of the following: -# 1. A git commit hash e.g. "4af93ff3" -# 2. A branch name e.g. "origin/branch-0.7" -# 3. A tag name e.g. "origin/tag/v0.8.0-incubating" -# 4. A pull request e.g. 
"origin/pr/675" -SPARK_COMMIT_ID = "" -SPARK_GIT_REPO = "https://github.com/apache/spark.git" -SPARK_MERGE_COMMIT_INTO_MASTER = False # Whether to merge the commit into master - -# Whether to install and build Spark. Set this to true only for the -# first installation if an existing one does not already exist. -PREP_SPARK = not USE_CLUSTER_SPARK - -# Whether to restart the Master and all Workers -# This should always be false for Yarn -RESTART_SPARK_CLUSTER = True -RESTART_SPARK_CLUSTER = RESTART_SPARK_CLUSTER and not IS_YARN_MODE - -# Rsync SPARK_HOME to all the slaves or not -RSYNC_SPARK_HOME = True - -# Which tests to run -RUN_SPARK_TESTS = True -RUN_PYSPARK_TESTS = False -RUN_STREAMING_TESTS = False -RUN_MLLIB_TESTS = False -RUN_PYTHON_MLLIB_TESTS = False - -# Which tests to prepare. Set this to true for the first -# installation or whenever you make a change to the tests. -PREP_SPARK_TESTS = True -PREP_PYSPARK_TESTS = False -PREP_STREAMING_TESTS = False -PREP_MLLIB_TESTS = False - -# Whether to warm up local disks (warm-up is only necesary on EC2). -DISK_WARMUP = False - -# Total number of bytes used to warm up each local directory. -DISK_WARMUP_BYTES = 200 * 1024 * 1024 - -# Number of files to create when warming up each local directory. -# Bytes will be evenly divided across files. -DISK_WARMUP_FILES = 200 - -# Prompt for confirmation when deleting temporary files. -PROMPT_FOR_DELETES = True - -# Files to write results to -SPARK_OUTPUT_FILENAME = "results/spark_perf_output_%s_%s" % ( - SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) -PYSPARK_OUTPUT_FILENAME = "results/python_perf_output_%s_%s" % ( - SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) -STREAMING_OUTPUT_FILENAME = "results/streaming_perf_output_%s_%s" % ( - SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) -MLLIB_OUTPUT_FILENAME = "results/mllib_perf_output_%s_%s" % ( - SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) -PYTHON_MLLIB_OUTPUT_FILENAME = "results/python_mllib_perf_output_%s_%s" % ( - SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) - - -# ============================ # -# Test Configuration Options # -# ============================ # - -# The default values configured below are appropriate for approximately 20 m1.xlarge nodes, -# in which each node has 15 GB of memory. Use this variable to scale the values (e.g. -# number of records in a generated dataset) if you are running the tests with more -# or fewer nodes. When developing new test suites, you might want to set this to a small -# value suitable for a single machine, such as 0.001. -SCALE_FACTOR = 1.0 - -assert SCALE_FACTOR > 0, "SCALE_FACTOR must be > 0." - -# If set, removes the first N trials for each test from all reported statistics. Useful for -# tests which have outlier behavior due to JIT and other system cache warm-ups. If any test -# returns fewer N + 1 results, an exception is thrown. -IGNORED_TRIALS = 2 - -# Command used to launch Scala or Java. - -# Set up OptionSets. Note that giant cross product is done over all JavaOptionsSets + OptionSets -# passed to each test which may be combinations of those set up here. - -# Java options. -COMMON_JAVA_OPTS = [ - # Fraction of JVM memory used for caching RDDs. 
- JavaOptionSet("spark.storage.memoryFraction", [0.66]), - JavaOptionSet("spark.serializer", ["org.apache.spark.serializer.JavaSerializer"]), - # JavaOptionSet("spark.executor.memory", ["9g"]), - # Turn event logging on in order better diagnose failed tests. Off by default as it crashes - # releases prior to 1.0.2 - # JavaOptionSet("spark.eventLog.enabled", [True]), - # To ensure consistency across runs, we disable delay scheduling - JavaOptionSet("spark.locality.wait", [str(60 * 1000 * 1000)]) -] -# Set driver memory here -SPARK_DRIVER_MEMORY = "20g" -# The following options value sets are shared among all tests. -COMMON_OPTS = [ - # How many times to run each experiment - used to warm up system caches. - # This OptionSet should probably only have a single value (i.e., length 1) - # since it doesn't make sense to have multiple values here. - OptionSet("num-trials", [10]), - # Extra pause added between trials, in seconds. For runs with large amounts - # of shuffle data, this gives time for buffer cache write-back. - OptionSet("inter-trial-wait", [3]) -] - -# The following options value sets are shared among all tests of -# operations on key-value data. -SPARK_KEY_VAL_TEST_OPTS = [ - # The number of input partitions. - OptionSet("num-partitions", [400], can_scale=True), - # The number of reduce tasks. - OptionSet("reduce-tasks", [400], can_scale=True), - # A random seed to make tests reproducable. - OptionSet("random-seed", [5]), - # Input persistence strategy (can be "memory", "disk", or "hdfs"). - # NOTE: If "hdfs" is selected, datasets will be re-used across runs of - # this script. This means parameters here are effectively ignored if - # an existing input dataset is present. - OptionSet("persistent-type", ["memory"]), - # Whether to wait for input in order to exit the JVM. - FlagSet("wait-for-exit", [False]), - # Total number of records to create. - OptionSet("num-records", [200 * 1000 * 1000], True), - # Number of unique keys to sample from. - OptionSet("unique-keys",[20 * 1000], True), - # Length in characters of each key. - OptionSet("key-length", [10]), - # Number of unique values to sample from. - OptionSet("unique-values", [1000 * 1000], True), - # Length in characters of each value. - OptionSet("value-length", [10]), - # Use hashes instead of padded numbers for keys and values - FlagSet("hash-records", [False]), - # Storage location if HDFS persistence is used - OptionSet("storage-location", [ - HDFS_URL + "/spark-perf-kv-data"]) -] - - -# ======================= # -# Spark Core Test Setup # -# ======================= # - -# Set up the actual tests. Each test is represtented by a tuple: -# (short_name, test_cmd, scale_factor, list, list) - -SPARK_KV_OPTS = COMMON_OPTS + SPARK_KEY_VAL_TEST_OPTS -SPARK_TESTS = [] - -SCHEDULING_THROUGHPUT_OPTS = [ - # The number of tasks that should be launched in each job: - OptionSet("num-tasks", [10 * 1000]), - # The number of jobs that should be run: - OptionSet("num-jobs", [1]), - # The size of the task closure (in bytes): - OptionSet("closure-size", [0]), - # A random seed to make tests reproducible: - OptionSet("random-seed", [5]), -] - -SPARK_TESTS += [("scheduling-throughput", "spark.perf.TestRunner", - SCALE_FACTOR, COMMON_JAVA_OPTS, - [ConstantOption("scheduling-throughput")] + COMMON_OPTS + SCHEDULING_THROUGHPUT_OPTS)] - -SPARK_TESTS += [("scala-agg-by-key", "spark.perf.TestRunner", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("aggregate-by-key")] + SPARK_KV_OPTS)] - -# Scale the input for this test by 2x since ints are smaller. 
-SPARK_TESTS += [("scala-agg-by-key-int", "spark.perf.TestRunner", SCALE_FACTOR * 2, - COMMON_JAVA_OPTS, [ConstantOption("aggregate-by-key-int")] + SPARK_KV_OPTS)] - -SPARK_TESTS += [("scala-agg-by-key-naive", "spark.perf.TestRunner", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("aggregate-by-key-naive")] + SPARK_KV_OPTS)] - -# Scale the input for this test by 0.10. -SPARK_TESTS += [("scala-sort-by-key", "spark.perf.TestRunner", SCALE_FACTOR * 0.1, - COMMON_JAVA_OPTS, [ConstantOption("sort-by-key")] + SPARK_KV_OPTS)] - -SPARK_TESTS += [("scala-sort-by-key-int", "spark.perf.TestRunner", SCALE_FACTOR * 0.2, - COMMON_JAVA_OPTS, [ConstantOption("sort-by-key-int")] + SPARK_KV_OPTS)] - -SPARK_TESTS += [("scala-count", "spark.perf.TestRunner", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("count")] + SPARK_KV_OPTS)] - -SPARK_TESTS += [("scala-count-w-fltr", "spark.perf.TestRunner", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("count-with-filter")] + SPARK_KV_OPTS)] - - -# ==================== # -# Pyspark Test Setup # -# ==================== # - -PYSPARK_TESTS = [] - -BROADCAST_TEST_OPTS = [ - # The size of broadcast - OptionSet("broadcast-size", [200 << 20], can_scale=True), -] - -PYSPARK_TESTS += [("python-scheduling-throughput", "core_tests.py", - SCALE_FACTOR, COMMON_JAVA_OPTS, - [ConstantOption("SchedulerThroughputTest"), OptionSet("num-tasks", [5000])] + COMMON_OPTS)] - -PYSPARK_TESTS += [("python-agg-by-key", "core_tests.py", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("AggregateByKey")] + SPARK_KV_OPTS)] - -# Scale the input for this test by 2x since ints are smaller. -PYSPARK_TESTS += [("python-agg-by-key-int", "core_tests.py", SCALE_FACTOR * 2, - COMMON_JAVA_OPTS, [ConstantOption("AggregateByKeyInt")] + SPARK_KV_OPTS)] - -PYSPARK_TESTS += [("python-agg-by-key-naive", "core_tests.py", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("AggregateByKeyNaive")] + SPARK_KV_OPTS)] - -# Scale the input for this test by 0.10. 
-PYSPARK_TESTS += [("python-sort-by-key", "core_tests.py", SCALE_FACTOR * 0.1, - COMMON_JAVA_OPTS, [ConstantOption("SortByKey")] + SPARK_KV_OPTS)] - -PYSPARK_TESTS += [("python-sort-by-key-int", "core_tests.py", SCALE_FACTOR * 0.2, - COMMON_JAVA_OPTS, [ConstantOption("SortByKeyInt")] + SPARK_KV_OPTS)] - -PYSPARK_TESTS += [("python-count", "core_tests.py", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("Count")] + SPARK_KV_OPTS)] - -PYSPARK_TESTS += [("python-count-w-fltr", "core_tests.py", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("CountWithFilter")] + SPARK_KV_OPTS)] - -PYSPARK_TESTS += [("python-broadcast-w-bytes", "core_tests.py", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("BroadcastWithBytes")] + SPARK_KV_OPTS + BROADCAST_TEST_OPTS)] - -PYSPARK_TESTS += [("python-broadcast-w-set", "core_tests.py", SCALE_FACTOR, - COMMON_JAVA_OPTS, [ConstantOption("BroadcastWithSet")] + SPARK_KV_OPTS + BROADCAST_TEST_OPTS)] - - -# ============================ # -# Spark Streaming Test Setup # -# ============================ # - -STREAMING_TESTS = [] - -# The following function generates options for setting batch duration in streaming tests -def streaming_batch_duration_opts(duration): - return [OptionSet("batch-duration", [duration])] - -# The following function generates options for setting window duration in streaming tests -def streaming_window_duration_opts(duration): - return [OptionSet("window-duration", [duration])] - -STREAMING_COMMON_OPTS = [ - OptionSet("total-duration", [60]), - OptionSet("hdfs-url", [HDFS_URL]), -] - -STREAMING_COMMON_JAVA_OPTS = [ - # Fraction of JVM memory used for caching RDDs. - JavaOptionSet("spark.storage.memoryFraction", [0.66]), - JavaOptionSet("spark.serializer", ["org.apache.spark.serializer.JavaSerializer"]), - # JavaOptionSet("spark.executor.memory", ["9g"]), - JavaOptionSet("spark.executor.extraJavaOptions", [" -XX:+UseConcMarkSweepGC "]) -] - -STREAMING_KEY_VAL_TEST_OPTS = STREAMING_COMMON_OPTS + streaming_batch_duration_opts(2000) + [ - # Number of input streams. - OptionSet("num-streams", [1], can_scale=True), - # Number of records per second per input stream - OptionSet("records-per-sec", [10 * 1000]), - # Number of reduce tasks. - OptionSet("reduce-tasks", [10], can_scale=True), - # memory serialization ("true" or "false"). - OptionSet("memory-serialization", ["true"]), - # Number of unique keys to sample from. - OptionSet("unique-keys",[100 * 1000], can_scale=True), - # Length in characters of each key. 
- OptionSet("unique-values", [1000 * 1000], can_scale=True), - # Send data through receiver - OptionSet("use-receiver", ["true"]), -] - -STREAMING_HDFS_RECOVERY_TEST_OPTS = STREAMING_COMMON_OPTS + streaming_batch_duration_opts(5000) + [ - OptionSet("records-per-file", [10000]), - OptionSet("file-cleaner-delay", [300]) -] - -# This test is just to see if everything is setup properly -STREAMING_TESTS += [("basic", "streaming.perf.TestRunner", SCALE_FACTOR, - STREAMING_COMMON_JAVA_OPTS, [ConstantOption("basic")] + STREAMING_COMMON_OPTS + streaming_batch_duration_opts(1000))] - -STREAMING_TESTS += [("state-by-key", "streaming.perf.TestRunner", SCALE_FACTOR, - STREAMING_COMMON_JAVA_OPTS, [ConstantOption("state-by-key")] + STREAMING_KEY_VAL_TEST_OPTS)] - -STREAMING_TESTS += [("group-by-key-and-window", "streaming.perf.TestRunner", SCALE_FACTOR, - STREAMING_COMMON_JAVA_OPTS, [ConstantOption("group-by-key-and-window")] + STREAMING_KEY_VAL_TEST_OPTS + streaming_window_duration_opts(10000) )] - -STREAMING_TESTS += [("reduce-by-key-and-window", "streaming.perf.TestRunner", SCALE_FACTOR, - STREAMING_COMMON_JAVA_OPTS, [ConstantOption("reduce-by-key-and-window")] + STREAMING_KEY_VAL_TEST_OPTS + streaming_window_duration_opts(10000) )] - -STREAMING_TESTS += [("hdfs-recovery", "streaming.perf.TestRunner", SCALE_FACTOR, - STREAMING_COMMON_JAVA_OPTS, [ConstantOption("hdfs-recovery")] + STREAMING_HDFS_RECOVERY_TEST_OPTS)] - - -# ================== # -# MLlib Test Setup # -# ================== # - -MLLIB_TESTS = [] -MLLIB_PERF_TEST_RUNNER = "mllib.perf.TestRunner" - -# Set this to 1.0, 1.1, 1.2, ... (the major version) to test MLlib with a particular Spark version. -# Note: You should also build mllib-perf using -Dspark.version to specify the same version. -# Note: To run perf tests against a snapshot version of Spark which has not yet been packaged into a release: -# * Build Spark locally by running `build/sbt assembly; build/sbt publishLocal` in the Spark root directory -# * Set `USE_CLUSTER_SPARK = True` and `MLLIB_SPARK_VERSION = {desired Spark version, e.g. 1.5}` -# * Don't use PREP_MLLIB_TESTS = True; instead manually run `cd mllib-tests; sbt/sbt -Dspark.version=1.5.0-SNAPSHOT clean assembly` to build perf tests - -# Can be changed to 2.0 for using Spark 2.0 -MLLIB_SPARK_VERSION = 1.5 - -MLLIB_JAVA_OPTS = COMMON_JAVA_OPTS -if MLLIB_SPARK_VERSION >= 1.1: - MLLIB_JAVA_OPTS = MLLIB_JAVA_OPTS + [ - # Shuffle manager: SORT, HASH - JavaOptionSet("spark.shuffle.manager", ["SORT"]) - ] - -# The following options value sets are shared among all tests of -# operations on MLlib algorithms. -MLLIB_COMMON_OPTS = COMMON_OPTS + [ - # The number of input partitions. - # The default setting is suitable for a 16-node m3.2xlarge EC2 cluster. - OptionSet("num-partitions", [128], can_scale=True), - # A random seed to make tests reproducable. - OptionSet("random-seed", [5]) -] - -# Algorithms available in Spark-1.0 # - -# Regression and Classification Tests # -MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS = MLLIB_COMMON_OPTS + [ - # The number of rows or examples - OptionSet("num-examples", [1000000], can_scale=True) -] - -# Generalized Linear Model (GLM) Tests # -MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ - # The scale factor for the noise in feature values. - # Currently ignored for regression. 
- OptionSet("feature-noise", [1.0]), - # The number of features per example - OptionSet("num-features", [10000], can_scale=False), - # The number of iterations for SGD - OptionSet("num-iterations", [20]), - # The step size for SGD - OptionSet("step-size", [0.001]), - # Regularization type: none, l1, l2 - OptionSet("reg-type", ["l2"]), - # Regularization parameter - OptionSet("reg-param", [0.1]) -] -if MLLIB_SPARK_VERSION >= 1.5: - MLLIB_GLM_TEST_OPTS += [ - # Ignored, but required for config - OptionSet("elastic-net-param", [0.0]) - ] - -# GLM Regression Tests # -MLLIB_GLM_REGRESSION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [ - # Optimization algorithm: sgd - OptionSet("optimizer", ["sgd"]), - # The intercept for the data - OptionSet("intercept", [0.0]), - # The scale factor for label noise - OptionSet("label-noise", [0.1]), - # Loss to minimize: l2 (squared error) - OptionSet("loss", ["l2"]) -] - -MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] + MLLIB_GLM_REGRESSION_TEST_OPTS)] - -# Classification Tests # -MLLIB_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [ - # Expected fraction of examples which are negative - OptionSet("per-negative", [0.3]), - # Optimization algorithm: sgd, l-bfgs - OptionSet("optimizer", ["sgd", "l-bfgs"]) -] - -# GLM Classification Tests # -MLLIB_GLM_CLASSIFICATION_TEST_OPTS = MLLIB_CLASSIFICATION_TEST_OPTS + [ - # Loss to minimize: logistic, hinge (SVM) - OptionSet("loss", ["logistic"]) -] - -MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] + - MLLIB_GLM_CLASSIFICATION_TEST_OPTS)] - -if MLLIB_SPARK_VERSION >= 1.5: - MLLIB_GLM_ELASTIC_NET_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ - # The max number of iterations for LBFGS/OWLQN - OptionSet("num-iterations", [20]), - # LBFGS/OWLQN is used with elastic-net regularization. - OptionSet("optimizer", ["auto"]), - # Using elastic-net regularization. - OptionSet("reg-type", ["elastic-net"]), - # Runs with L2 (param = 0.0), L1 (param = 1.0). - OptionSet("elastic-net-param", [0.0, 1.0]), - # Regularization param (lambda) - OptionSet("reg-param", [0.01]), - # The scale factor for the noise in feature values - OptionSet("feature-noise", [1.0]), - # The step size is not used in LBFGS, but this is required in parameter checking. - OptionSet("step-size", [0.0]) - ] - - MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [ - # The scale factor for the noise in label values - OptionSet("label-noise", [0.1]), - # The intercept for the data - OptionSet("intercept", [0.2]), - # Loss to minimize: l2 (squared error) - OptionSet("loss", ["l2"]) - ] - - # Test L-BFGS - MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] + - MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS + - [OptionSet("num-features", [10000], can_scale=False)])] - # Test normal equation solver - MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] + - MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS + - [OptionSet("num-features", [200], can_scale=False)])] - - MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [ - # Expected fraction of examples which are negative - OptionSet("per-negative", [0.3]), - # In GLM classification with elastic-net regularization, only logistic loss is supported. 
- OptionSet("loss", ["logistic"]) - ] - - # Test L-BFGS - MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] + - MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS + - [OptionSet("num-features", [10000], can_scale=False)])] - # Test normal equation solver - MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] + - MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS + - [OptionSet("num-features", [200], can_scale=False)])] - -NAIVE_BAYES_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ - # The number of features per example - OptionSet("num-features", [10000], can_scale=False), - # Expected fraction of examples which are negative - OptionSet("per-negative", [0.3]), - # The scale factor for the noise in feature values - OptionSet("feature-noise", [1.0]), - # Naive Bayes smoothing lambda. - OptionSet("nb-lambda", [1.0]), - # Model type: either multinomial or bernoulli (bernoulli only available in Spark 1.4+) - OptionSet("model-type", ["multinomial"]), -] - -MLLIB_TESTS += [("naive-bayes", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("naive-bayes")] + - NAIVE_BAYES_TEST_OPTS)] - -# Decision Trees # -MLLIB_DECISION_TREE_TEST_OPTS = MLLIB_COMMON_OPTS + [ - # The number of rows or examples - OptionSet("num-examples", [1000000], can_scale=True), - # The number of features per example - OptionSet("num-features", [500], can_scale=False), - # Type of label: 0 indicates regression, 2+ indicates classification with this many classes - # Note: multi-class (>2) is not supported in Spark 1.0. - OptionSet("label-type", [0, 2], can_scale=False), - # Fraction of features which are categorical - OptionSet("frac-categorical-features", [0.5], can_scale=False), - # Fraction of categorical features which are binary. Others have 20 categories. - OptionSet("frac-binary-features", [0.5], can_scale=False), - # Depth of true decision tree model used to label examples. - # WARNING: The meaning of depth changed from Spark 1.0 to Spark 1.1: - # depth=N for Spark 1.0 should be depth=N-1 for Spark 1.1 - OptionSet("tree-depth", [5, 10], can_scale=False), - # Maximum number of bins for the decision tree learning algorithm. - OptionSet("max-bins", [32], can_scale=False), -] - -if MLLIB_SPARK_VERSION >= 1.2: - ensembleTypes = ["RandomForest"] - if MLLIB_SPARK_VERSION >= 1.3: - ensembleTypes.append("GradientBoostedTrees") - if MLLIB_SPARK_VERSION >= 1.4: - ensembleTypes.extend(["ml.RandomForest", "ml.GradientBoostedTrees"]) - MLLIB_DECISION_TREE_TEST_OPTS += [ - # Ensemble type: mllib.RandomForest, mllib.GradientBoostedTrees, - # ml.RandomForest, ml.GradientBoostedTrees - OptionSet("ensemble-type", ensembleTypes), - # Path to training dataset (if not given, use random data). - OptionSet("training-data", [""]), - # Path to test dataset (only used if training dataset given). - # If not given, hold out part of training data for validation. - OptionSet("test-data", [""]), - # Fraction of data to hold out for testing - # (Ignored if given training and test dataset, or if using synthetic data.) - OptionSet("test-data-fraction", [0.2], can_scale=False), - # Number of trees. If 1, then run DecisionTree. If >1, then run RandomForest. 
- OptionSet("num-trees", [1, 10], can_scale=False), - # Feature subset sampling strategy: auto, all, sqrt, log2, onethird - # (only used for RandomForest) - OptionSet("feature-subset-strategy", ["auto"]) - ] - -MLLIB_TESTS += [("decision-tree", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("decision-tree")] + - MLLIB_DECISION_TREE_TEST_OPTS)] - -# Recommendation Tests # -MLLIB_RECOMMENDATION_TEST_OPTS = MLLIB_COMMON_OPTS + [ - # The number of users - OptionSet("num-users", [6000000], can_scale=True), - # The number of products - OptionSet("num-products", [5000000], can_scale=False), - # The number of ratings - OptionSet("num-ratings", [50000000], can_scale=True), - # The number of iterations for ALS - OptionSet("num-iterations", [10]), - # The rank of the factorized matrix model - OptionSet("rank", [10]), - # The regularization parameter - OptionSet("reg-param", [0.1]), - # Whether to use implicit preferences or not - FlagSet("implicit-prefs", [False]) -] - -MLLIB_TESTS += [("als", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("als")] + - MLLIB_RECOMMENDATION_TEST_OPTS)] - -# Clustering Tests # -MLLIB_CLUSTERING_TEST_OPTS = MLLIB_COMMON_OPTS + [ - # The number of examples - OptionSet("num-examples", [1000000], can_scale=True), - # The number of features per point - OptionSet("num-features", [10000], can_scale=False), - # The number of centers - OptionSet("num-centers", [20]), - # The number of iterations for KMeans - OptionSet("num-iterations", [20]) -] - -MLLIB_TESTS += [("kmeans", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("kmeans")] + MLLIB_CLUSTERING_TEST_OPTS)] - -MLLIB_GMM_TEST_OPTS = MLLIB_COMMON_OPTS + [ - OptionSet("num-examples", [1000000], can_scale=True), - OptionSet("num-features", [100], can_scale=False), - OptionSet("num-centers", [20], can_scale=False), - OptionSet("num-iterations", [20])] - -if MLLIB_SPARK_VERSION >= 1.3: - MLLIB_TESTS += [("gmm", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("gmm")] + MLLIB_GMM_TEST_OPTS)] - -MLLIB_LDA_TEST_OPTS = MLLIB_COMMON_OPTS + [ - OptionSet("num-documents", [50000], can_scale=True), - OptionSet("num-vocab", [10000], can_scale=False), - OptionSet("num-topics", [20], can_scale=False), - OptionSet("num-iterations", [20]), - OptionSet("document-length", [100]), - OptionSet("optimizer", ["em", "online"])] - -if MLLIB_SPARK_VERSION >= 1.4: - MLLIB_TESTS += [("lda", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("lda")] + MLLIB_LDA_TEST_OPTS)] - -MLLIB_PIC_TEST_OPTS = MLLIB_COMMON_OPTS + [ - OptionSet("num-examples", [10000000], can_scale=True), - OptionSet("node-degree", [20], can_scale=False), - OptionSet("num-centers", [40], can_scale=False), - OptionSet("num-iterations", [20])] - -if MLLIB_SPARK_VERSION >= 1.3: - MLLIB_TESTS += [("pic", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("pic")] + MLLIB_PIC_TEST_OPTS)] - -# Linear Algebra Tests # -MLLIB_LINALG_TEST_OPTS = MLLIB_COMMON_OPTS + [ - # The number of rows for the matrix - OptionSet("num-rows", [1000000], can_scale=True), - # The number of columns for the matrix - OptionSet("num-cols", [1000], can_scale=False), - # The number of top singular values wanted for SVD and PCA - OptionSet("rank", [50], can_scale=False) -] -# Linear Algebra Tests which take more time (slightly smaller settings) # -MLLIB_BIG_LINALG_TEST_OPTS = MLLIB_COMMON_OPTS + [ - # The number of rows for the matrix - OptionSet("num-rows", [1000000], 
can_scale=True), - # The number of columns for the matrix - OptionSet("num-cols", [500], can_scale=False), - # The number of top singular values wanted for SVD and PCA - OptionSet("rank", [10], can_scale=False) -] - -MLLIB_TESTS += [("svd", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("svd")] + MLLIB_BIG_LINALG_TEST_OPTS)] - -MLLIB_TESTS += [("pca", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("pca")] + MLLIB_LINALG_TEST_OPTS)] - -MLLIB_TESTS += [("summary-statistics", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("summary-statistics")] + - MLLIB_BIG_LINALG_TEST_OPTS)] - -MLLIB_BLOCK_MATRIX_MULT_TEST_OPTS = MLLIB_COMMON_OPTS + [ - OptionSet("m", [20000], can_scale=True), - OptionSet("k", [10000], can_scale=False), - OptionSet("n", [10000], can_scale=False), - OptionSet("block-size", [1024], can_scale=False)] - -if MLLIB_SPARK_VERSION >= 1.3: - MLLIB_TESTS += [("block-matrix-mult", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("block-matrix-mult")] + MLLIB_BLOCK_MATRIX_MULT_TEST_OPTS)] - -# Statistic Toolkit Tests # -MLLIB_STATS_TEST_OPTS = MLLIB_COMMON_OPTS - -MLLIB_PEARSON_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ - [OptionSet("num-rows", [1000000], can_scale=True), - OptionSet("num-cols", [1000], can_scale=False)] - -MLLIB_SPEARMAN_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ - [OptionSet("num-rows", [1000000], can_scale=True), - OptionSet("num-cols", [100], can_scale=False)] - -MLLIB_CHI_SQ_FEATURE_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ - [OptionSet("num-rows", [2000000], can_scale=True), - OptionSet("num-cols", [500], can_scale=False)] - -MLLIB_CHI_SQ_GOF_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ - [OptionSet("num-rows", [50000000], can_scale=True), - OptionSet("num-cols", [0], can_scale=False)] - -MLLIB_CHI_SQ_MAT_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ - [OptionSet("num-rows", [20000], can_scale=True), - OptionSet("num-cols", [0], can_scale=False)] - -if MLLIB_SPARK_VERSION >= 1.1: - MLLIB_TESTS += [("pearson", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("pearson")] + MLLIB_PEARSON_TEST_OPTS)] - - MLLIB_TESTS += [("spearman", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("spearman")] + MLLIB_SPEARMAN_TEST_OPTS)] - - MLLIB_TESTS += [("chi-sq-feature", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("chi-sq-feature")] + MLLIB_CHI_SQ_FEATURE_TEST_OPTS)] - - MLLIB_TESTS += [("chi-sq-gof", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("chi-sq-gof")] + MLLIB_CHI_SQ_GOF_TEST_OPTS)] - - MLLIB_TESTS += [("chi-sq-mat", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("chi-sq-mat")] + MLLIB_CHI_SQ_MAT_TEST_OPTS)] - -# Feature Transformation Tests # - -MLLIB_FEATURE_TEST_OPTS = MLLIB_COMMON_OPTS - -MLLIB_WORD2VEC_TEST_OPTS = MLLIB_FEATURE_TEST_OPTS + \ - [OptionSet("num-sentences", [1000000], can_scale=True), - OptionSet("num-words", [10000], can_scale=False), - OptionSet("vector-size", [100], can_scale=False), - OptionSet("num-iterations", [3], can_scale=False), - OptionSet("min-count", [5], can_scale=False)] - -if MLLIB_SPARK_VERSION >= 1.3: # TODO: make it work in 1.2 - MLLIB_TESTS += [("word2vec", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("word2vec")] + MLLIB_WORD2VEC_TEST_OPTS)] - -# Frequent Pattern Matching Tests # - -MLLIB_FPM_TEST_OPTS = MLLIB_COMMON_OPTS - -MLLIB_FP_GROWTH_TEST_OPTS = MLLIB_FPM_TEST_OPTS + \ - [OptionSet("num-baskets", 
[5000000], can_scale=True), - OptionSet("avg-basket-size", [10], can_scale=False), - OptionSet("num-items", [1000], can_scale=False), - OptionSet("min-support", [0.01], can_scale=False)] - -if MLLIB_SPARK_VERSION >= 1.3: - MLLIB_TESTS += [("fp-growth", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("fp-growth")] + MLLIB_FP_GROWTH_TEST_OPTS)] - -# TODO: tune test size to have runtime within 30-60 seconds -MLLIB_PREFIX_SPAN_TEST_OPTS = MLLIB_FPM_TEST_OPTS + \ - [OptionSet("num-sequences", [5000000], can_scale=True), - OptionSet("avg-sequence-size", [5], can_scale=False), - OptionSet("avg-itemset-size", [2], can_scale=False), - OptionSet("num-items", [500], can_scale=False), - OptionSet("min-support", [0.5], can_scale=False), - OptionSet("max-pattern-len", [10], can_scale=False), - OptionSet("max-local-proj-db-size", [32000000], can_scale=False)] - -if MLLIB_SPARK_VERSION >= 1.5: - MLLIB_TESTS += [("prefix-span", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("prefix-span")] + MLLIB_PREFIX_SPAN_TEST_OPTS)] - -# Python MLlib tests -PYTHON_MLLIB_TESTS = [] - -PYTHON_MLLIB_TESTS += [("python-glm-classification", "mllib_tests.py", SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("GLMClassificationTest")] + - MLLIB_GLM_CLASSIFICATION_TEST_OPTS)] - -PYTHON_MLLIB_TESTS += [("python-glm-regression", "mllib_tests.py", SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("GLMRegressionTest")] + - MLLIB_GLM_REGRESSION_TEST_OPTS)] - -PYTHON_MLLIB_TESTS += [("python-naive-bayes", "mllib_tests.py", SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("NaiveBayesTest")] + - NAIVE_BAYES_TEST_OPTS)] - -PYTHON_MLLIB_TESTS += [("python-als", "mllib_tests.py", SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("ALSTest")] + - MLLIB_RECOMMENDATION_TEST_OPTS)] - -PYTHON_MLLIB_TESTS += [("python-kmeans", "mllib_tests.py", SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("KMeansTest")] + MLLIB_CLUSTERING_TEST_OPTS)] - -if MLLIB_SPARK_VERSION >= 1.1: - PYTHON_MLLIB_TESTS += [("python-pearson", "mllib_tests.py", SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("PearsonCorrelationTest")] + - MLLIB_PEARSON_TEST_OPTS)] - - PYTHON_MLLIB_TESTS += [("python-spearman", "mllib_tests.py", SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("SpearmanCorrelationTest")] + - MLLIB_SPEARMAN_TEST_OPTS)] - diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala index cd392d8..ebf12ab 100644 --- a/mllib-tests/project/MLlibTestsBuild.scala +++ b/mllib-tests/project/MLlibTestsBuild.scala @@ -17,14 +17,20 @@ object MLlibTestsBuild extends Build { organization := "org.spark-project", version := "0.1", scalaVersion := "2.11.8", - sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0-preview"), + sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0-SNAPSHOT"), libraryDependencies ++= Seq( "net.sf.jopt-simple" % "jopt-simple" % "4.6", "org.scalatest" %% "scalatest" % "2.2.1" % "test", "org.slf4j" % "slf4j-log4j12" % "1.7.2", - "org.json4s" %% "json4s-native" % "3.2.10", - "org.apache.spark" %% "spark-core" % "2.0.0-preview" % "provided", - "org.apache.spark" %% "spark-mllib" % "2.0.0-preview" % "provided" + "org.json4s" %% "json4s-native" % "3.2.10" + + // IMPORTANT! 
+ // We need to uncomment the below once Spark 2.0.0 becomes available + // This relies on using spark built under the lib folder + // of this project + + //"org.apache.spark" %% "spark-core" % "2.0.0-SNAPSHOT" % "provided", + //"org.apache.spark" %% "spark-mllib" % "2.0.0-SNAPSHOT" % "provided" ) ) @@ -33,12 +39,12 @@ object MLlibTestsBuild extends Build { file("."), settings = assemblySettings ++ commonSettings ++ Seq( scalaSource in Compile := { + println("sparkVersion.value is: " + sparkVersion.value) val targetFolder = sparkVersion.value match { case v if v.startsWith("1.4.") => "v1p4" case v if v.startsWith("1.5.") => "v1p5" - case v if v.startsWith("1.6.") => - "v1p5" // acceptable for now, but change later when new algs are added - case v if v.startsWith("2.0") => "v2p0" + case v if v.startsWith("1.6.") => "v1p5" + case v if v.startsWith("2.0") => "v2p0" case _ => throw new IllegalArgumentException(s"This Spark version isn't supported: ${sparkVersion.value}.") } baseDirectory.value / targetFolder / "src" / "main" / "scala" diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala index 4dd1d49..6f89aac 100644 --- a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala +++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala @@ -315,13 +315,13 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { def runTest(rdd: RDD[Vector]): KMeansModel - val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") - val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") + val NUM_POINTS = ("num-points", "number of points for clustering tests") + val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) - longOptions = longOptions ++ Seq(NUM_EXAMPLES) + intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) + longOptions = longOptions ++ Seq(NUM_POINTS) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() @@ -329,21 +329,21 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { var testRdd: RDD[Vector] = _ def validate(model: KMeansModel, rdd: RDD[Vector]): Double = { - val numExamples = rdd.cache().count() + val numPoints = rdd.cache().count() val error = model.computeCost(rdd) - math.sqrt(error/numExamples) + math.sqrt(error/numPoints) } override def createInputData(seed: Long) = { val numPartitions: Int = intOptionValue(NUM_PARTITIONS) - val numExamples: Long = longOptionValue(NUM_EXAMPLES) - val numFeatures: Int = intOptionValue(NUM_FEATURES) + val numPoints: Long = longOptionValue(NUM_POINTS) + val numColumns: Int = intOptionValue(NUM_COLUMNS) val numCenters: Int = intOptionValue(NUM_CENTERS) - val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numExamples*1.25).toLong, numFeatures, + val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numPoints*1.25).toLong, numColumns, numCenters, numPartitions, seed) val split = data.randomSplit(Array(0.8, 0.2), seed) @@ -441,10 +441,9 @@ class ALSTest(sc: SparkContext) extends RecommendationTests(sc) { val rank: Int = intOptionValue(RANK) val regParam = doubleOptionValue(REG_PARAM) val seed = 
intOptionValue(RANDOM_SEED) + 12 - val implicitRatings: Boolean = booleanOptionValue(IMPLICIT) new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam) - .setBlocks(rdd.partitions.length).setImplicitPrefs(implicitRatings).run(rdd) + .setBlocks(rdd.partitions.size).run(rdd) } } diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala index 5903e2e..0004f8d 100644 --- a/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala +++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala @@ -16,21 +16,21 @@ import mllib.perf.PerfTest class GaussianMixtureTest(sc: SparkContext) extends PerfTest { // TODO: refactor k-means and GMM code - val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") - val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") + val NUM_POINTS = ("num-points", "number of points for clustering tests") + val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - intOptions ++= Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) - longOptions ++= Seq(NUM_EXAMPLES) + intOptions ++= Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) + longOptions ++= Seq(NUM_POINTS) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() var data: RDD[Vector] = _ override def createInputData(seed: Long): Unit = { - val m = longOptionValue(NUM_EXAMPLES) - val n = intOptionValue(NUM_FEATURES) + val m = longOptionValue(NUM_POINTS) + val n = intOptionValue(NUM_COLUMNS) val k = intOptionValue(NUM_CENTERS) val p = intOptionValue(NUM_PARTITIONS) @@ -47,7 +47,7 @@ class GaussianMixtureTest(sc: SparkContext) extends PerfTest { Vectors.dense(y.data) } }.cache() - logInfo(s"Generated ${data.count()} examples.") + logInfo(s"Generated ${data.count()} points.") } override def run(): JValue = { diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala index 2018c61..6832ffa 100644 --- a/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala +++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala @@ -11,28 +11,28 @@ import mllib.perf.PerfTest class PICTest(sc: SparkContext) extends PerfTest { - val NUM_EXAMPLES = ("num-examples", "number of examples") + val NUM_POINTS = ("num-points", "number of points") val NODE_DEGREE = ("node-degree", "number of neighbors each node is connected to") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") intOptions ++= Seq(NODE_DEGREE, NUM_CENTERS, NUM_ITERATIONS) - longOptions ++= Seq(NUM_EXAMPLES) + longOptions ++= Seq(NUM_POINTS) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() var data: RDD[(Long, Long, Double)] = _ override def createInputData(seed: Long): Unit = { - val numExamples = longOptionValue(NUM_EXAMPLES) + val numPoints = longOptionValue(NUM_POINTS) val nodeDegree = intOptionValue(NODE_DEGREE) val numPartitions = intOptionValue(NUM_PARTITIONS) // Generates a periodic banded matrix with 
bandwidth = nodeDegree
-    data = sc.parallelize(0L to numExamples, numPartitions)
+    data = sc.parallelize(0L to numPoints, numPartitions)
       .flatMap { id =>
-        (((id - nodeDegree / 2) % numExamples) until id).map { nbr =>
-          (id, (nbr + numExamples) % numExamples, 1D)
+        (((id - nodeDegree / 2) % numPoints) until id).map { nbr =>
+          (id, (nbr + numPoints) % numPoints, 1D)
         }
       }
     logInfo(s"Generated ${data.count()} pairwise similarities.")
@@ -46,7 +46,6 @@ class PICTest(sc: SparkContext) extends PerfTest {
       .setK(k)
       .setMaxIterations(numIterations)
     val model = pic.run(data)
-    model.assignments.count()
     val duration = (System.currentTimeMillis() - start) / 1e3
     "time" -> duration
   }
diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala
index f721ca7..6e354fd 100644
--- a/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala
+++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala
@@ -509,7 +509,7 @@ class FeaturesGenerator(val categoricalArities: Array[Int], val numContinuous: I
 class KMeansDataGenerator(
     val numCenters: Int,
-    val numFeatures: Int,
+    val numColumns: Int,
     val seed: Long) extends RandomDataGenerator[Vector] {

   private val rng = new java.util.Random(seed)

@@ -528,7 +528,7 @@ class KMeansDataGenerator(
   }

   private val centers = (0 until numCenters).map{i =>
-    Array.fill(numFeatures)((2 * rng.nextDouble() - 1)*scale_factors(i))
+    Array.fill(numColumns)((2 * rng.nextDouble() - 1)*scale_factors(i))
   }

   override def nextValue(): Vector = {
@@ -536,12 +536,12 @@ class KMeansDataGenerator(

     val centerToAddTo = centers(concentrations.indexWhere(p => pick_center_rand <= p))

-    Vectors.dense(Array.tabulate(numFeatures)(i => centerToAddTo(i) + rng2.nextGaussian()))
+    Vectors.dense(Array.tabulate(numColumns)(i => centerToAddTo(i) + rng2.nextGaussian()))
   }

   override def setSeed(seed: Long) {
     rng.setSeed(seed)
   }

-  override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numFeatures, seed)
+  override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numColumns, seed)
 }
diff --git a/mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala
index 1c06465..1f1ec27 100644
--- a/mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala
+++ b/mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala
@@ -97,12 +97,10 @@ abstract class GLMTests(sc: SparkContext)
 class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) {

   val INTERCEPT = ("intercept", "intercept for random data generation")
-  val FEATURE_NOISE = ("feature-noise",
-    "scale factor for the noise during feature generation; CURRENTLY IGNORED")
   val LABEL_NOISE = ("label-noise", "scale factor for the noise during label generation")
   val LOSS = ("loss", "loss to minimize.
Supported: l2 (squared error).") - doubleOptions = doubleOptions ++ Seq(INTERCEPT, FEATURE_NOISE, LABEL_NOISE) + doubleOptions = doubleOptions ++ Seq(INTERCEPT, LABEL_NOISE) stringOptions = stringOptions ++ Seq(LOSS) val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions @@ -160,7 +158,6 @@ class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) { .setElasticNetParam(elasticNetParam) .setRegParam(regParam) .setMaxIter(numIterations) - .setTol(0.0) val sqlContext = new SQLContext(rdd.context) import sqlContext.implicits._ val mlModel = rr.fit(rdd.toDF()) @@ -268,7 +265,6 @@ class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) { .setElasticNetParam(elasticNetParam) .setRegParam(regParam) .setMaxIter(numIterations) - .setTol(0.0) val sqlContext = new SQLContext(rdd.context) import sqlContext.implicits._ val mlModel = lor.fit(rdd.toDF()) @@ -383,8 +379,6 @@ abstract class RecommendationTests(sc: SparkContext) extends PerfTest { val testMetric = validate(model, testRdd) - /* - // Removed temporarily because these methods are really slow. val numThingsToRecommend = 10 start = System.currentTimeMillis() model.recommendProductsForUsers(numThingsToRecommend).count() @@ -392,11 +386,11 @@ abstract class RecommendationTests(sc: SparkContext) extends PerfTest { start = System.currentTimeMillis() model.recommendUsersForProducts(numThingsToRecommend).count() val recommendUsersForProductsTime = (System.currentTimeMillis() - start).toDouble / 1000.0 - */ + Map("trainingTime" -> trainingTime, "testTime" -> testTime, - "trainingMetric" -> trainingMetric, "testMetric" -> testMetric) - // "recommendProductsForUsersTime" -> recommendProductsForUsersTime, - // "recommendUsersForProductsTime" -> recommendUsersForProductsTime) + "trainingMetric" -> trainingMetric, "testMetric" -> testMetric, + "recommendProductsForUsersTime" -> recommendProductsForUsersTime, + "recommendUsersForProductsTime" -> recommendUsersForProductsTime) } } @@ -404,13 +398,13 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { def runTest(rdd: RDD[Vector]): KMeansModel - val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") - val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") + val NUM_POINTS = ("num-points", "number of points for clustering tests") + val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) - longOptions = longOptions ++ Seq(NUM_EXAMPLES) + intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) + longOptions = longOptions ++ Seq(NUM_POINTS) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() @@ -418,21 +412,21 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { var testRdd: RDD[Vector] = _ def validate(model: KMeansModel, rdd: RDD[Vector]): Double = { - val numExamples = rdd.cache().count() + val numPoints = rdd.cache().count() val error = model.computeCost(rdd) - math.sqrt(error/numExamples) + math.sqrt(error/numPoints) } override def createInputData(seed: Long) = { val numPartitions: Int = intOptionValue(NUM_PARTITIONS) - val numExamples: Long = longOptionValue(NUM_EXAMPLES) - val numFeatures: Int = 
intOptionValue(NUM_FEATURES) + val numPoints: Long = longOptionValue(NUM_POINTS) + val numColumns: Int = intOptionValue(NUM_COLUMNS) val numCenters: Int = intOptionValue(NUM_CENTERS) - val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numExamples*1.25).toLong, numFeatures, + val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numPoints*1.25).toLong, numColumns, numCenters, numPartitions, seed) val split = data.randomSplit(Array(0.8, 0.2), seed) @@ -530,10 +524,9 @@ class ALSTest(sc: SparkContext) extends RecommendationTests(sc) { val rank: Int = intOptionValue(RANK) val regParam = doubleOptionValue(REG_PARAM) val seed = intOptionValue(RANDOM_SEED) + 12 - val implicitRatings: Boolean = booleanOptionValue(IMPLICIT) new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam) - .setBlocks(rdd.partitions.length).setImplicitPrefs(implicitRatings).run(rdd) + .setBlocks(rdd.partitions.length).run(rdd) } } diff --git a/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala b/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala index 13da1ac..95ce9c6 100644 --- a/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala +++ b/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala @@ -15,21 +15,21 @@ import mllib.perf.PerfTest class GaussianMixtureTest(sc: SparkContext) extends PerfTest { - val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") - val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") + val NUM_POINTS = ("num-points", "number of points for clustering tests") + val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - intOptions ++= Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) - longOptions ++= Seq(NUM_EXAMPLES) + intOptions ++= Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) + longOptions ++= Seq(NUM_POINTS) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() var data: RDD[Vector] = _ override def createInputData(seed: Long): Unit = { - val m = longOptionValue(NUM_EXAMPLES) - val n = intOptionValue(NUM_FEATURES) + val m = longOptionValue(NUM_POINTS) + val n = intOptionValue(NUM_COLUMNS) val k = intOptionValue(NUM_CENTERS) val p = intOptionValue(NUM_PARTITIONS) @@ -46,7 +46,7 @@ class GaussianMixtureTest(sc: SparkContext) extends PerfTest { Vectors.dense(y.data) } }.cache() - logInfo(s"Generated ${data.count()} examples.") + logInfo(s"Generated ${data.count()} points.") } override def run(): JValue = { diff --git a/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/PICTest.scala b/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/PICTest.scala index 2018c61..6832ffa 100644 --- a/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/PICTest.scala +++ b/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/PICTest.scala @@ -11,28 +11,28 @@ import mllib.perf.PerfTest class PICTest(sc: SparkContext) extends PerfTest { - val NUM_EXAMPLES = ("num-examples", "number of examples") + val NUM_POINTS = ("num-points", "number of points") val NODE_DEGREE = ("node-degree", "number of neighbors each node is connected to") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", 
"number of iterations for the algorithm") intOptions ++= Seq(NODE_DEGREE, NUM_CENTERS, NUM_ITERATIONS) - longOptions ++= Seq(NUM_EXAMPLES) + longOptions ++= Seq(NUM_POINTS) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() var data: RDD[(Long, Long, Double)] = _ override def createInputData(seed: Long): Unit = { - val numExamples = longOptionValue(NUM_EXAMPLES) + val numPoints = longOptionValue(NUM_POINTS) val nodeDegree = intOptionValue(NODE_DEGREE) val numPartitions = intOptionValue(NUM_PARTITIONS) // Generates a periodic banded matrix with bandwidth = nodeDegree - data = sc.parallelize(0L to numExamples, numPartitions) + val data = sc.parallelize(0L to numPoints, numPartitions) .flatMap { id => - (((id - nodeDegree / 2) % numExamples) until id).map { nbr => - (id, (nbr + numExamples) % numExamples, 1D) + (((id - nodeDegree / 2) % numPoints) until id).map { nbr => + (id, (nbr + numPoints) % numPoints, 1D) } } logInfo(s"Generated ${data.count()} pairwise similarities.") @@ -46,7 +46,6 @@ class PICTest(sc: SparkContext) extends PerfTest { .setK(k) .setMaxIterations(numIterations) val model = pic.run(data) - model.assignments.count() val duration = (System.currentTimeMillis() - start) / 1e3 "time" -> duration } diff --git a/mllib-tests/v1p5/src/main/scala/mllib/perf/util/DataGenerator.scala b/mllib-tests/v1p5/src/main/scala/mllib/perf/util/DataGenerator.scala index 33f041e..e65a5a5 100644 --- a/mllib-tests/v1p5/src/main/scala/mllib/perf/util/DataGenerator.scala +++ b/mllib-tests/v1p5/src/main/scala/mllib/perf/util/DataGenerator.scala @@ -548,7 +548,7 @@ class FeaturesGenerator(val categoricalArities: Array[Int], val numContinuous: I class KMeansDataGenerator( val numCenters: Int, - val numFeatures: Int, + val numColumns: Int, val seed: Long) extends RandomDataGenerator[Vector] { private val rng = new java.util.Random(seed) @@ -567,7 +567,7 @@ class KMeansDataGenerator( } private val centers = (0 until numCenters).map{i => - Array.fill(numFeatures)((2 * rng.nextDouble() - 1)*scale_factors(i)) + Array.fill(numColumns)((2 * rng.nextDouble() - 1)*scale_factors(i)) } override def nextValue(): Vector = { @@ -575,12 +575,12 @@ class KMeansDataGenerator( val centerToAddTo = centers(concentrations.indexWhere(p => pick_center_rand <= p)) - Vectors.dense(Array.tabulate(numFeatures)(i => centerToAddTo(i) + rng2.nextGaussian())) + Vectors.dense(Array.tabulate(numColumns)(i => centerToAddTo(i) + rng2.nextGaussian())) } override def setSeed(seed: Long) { rng.setSeed(seed) } - override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numFeatures, seed) + override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numColumns, seed) } diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/LinearAlgebraTests.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/LinearAlgebraTests.scala new file mode 100644 index 0000000..b992173 --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/LinearAlgebraTests.scala @@ -0,0 +1,68 @@ +package mllib.perf + +import org.json4s.JsonAST.JValue +import org.json4s.JsonDSL._ + +import org.apache.spark.SparkContext +import org.apache.spark.mllib.linalg.distributed.RowMatrix + +import mllib.perf.util.DataGenerator + +/** Parent class for linear algebra tests which run on a large dataset. 
+ * Generated this way so that SVD / PCA can be added easily + */ +abstract class LinearAlgebraTests(sc: SparkContext) extends PerfTest { + + def runTest(rdd: RowMatrix, rank: Int) + + val NUM_ROWS = ("num-rows", "number of rows of the matrix") + val NUM_COLS = ("num-cols", "number of columns of the matrix") + val RANK = ("rank", "number of leading singular values") + + longOptions = Seq(NUM_ROWS) + intOptions = intOptions ++ Seq(RANK, NUM_COLS) + + var rdd: RowMatrix = _ + + val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions + addOptionsToParser() + override def createInputData(seed: Long) = { + val m: Long = longOptionValue(NUM_ROWS) + val n: Int = intOptionValue(NUM_COLS) + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + + rdd = DataGenerator.generateDistributedSquareMatrix(sc, m, n, numPartitions, seed) + } + + override def run(): JValue = { + val rank = intOptionValue(RANK) + + val start = System.currentTimeMillis() + runTest(rdd, rank) + val end = System.currentTimeMillis() + val time = (end - start).toDouble / 1000.0 + + Map("time" -> time) + } +} + + +class SVDTest(sc: SparkContext) extends LinearAlgebraTests(sc) { + override def runTest(data: RowMatrix, rank: Int) { + data.computeSVD(rank, computeU = true) + } +} + +class PCATest(sc: SparkContext) extends LinearAlgebraTests(sc) { + override def runTest(data: RowMatrix, rank: Int) { + val principal = data.computePrincipalComponents(rank) + sc.broadcast(principal) + data.multiply(principal) + } +} + +class ColumnSummaryStatisticsTest(sc: SparkContext) extends LinearAlgebraTests(sc) { + override def runTest(data: RowMatrix, rank: Int) { + data.computeColumnSummaryStatistics() + } +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/MLAlgorithmTests.scala new file mode 100644 index 0000000..693ca7c --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/MLAlgorithmTests.scala @@ -0,0 +1,779 @@ +package mllib.perf + +import org.json4s.JsonAST._ +import org.json4s.JsonDSL._ + +import org.apache.spark.SparkContext +import org.apache.spark.ml.PredictionModel +import org.apache.spark.ml.classification.{GBTClassificationModel, GBTClassifier, RandomForestClassificationModel, RandomForestClassifier, LogisticRegression} +import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor, RandomForestRegressionModel, RandomForestRegressor, LinearRegression} +import org.apache.spark.mllib.classification._ +import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} +import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.optimization.{SquaredL2Updater, L1Updater, SimpleUpdater} +import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating} +import org.apache.spark.mllib.regression._ +import org.apache.spark.mllib.tree.{GradientBoostedTrees, RandomForest} +import org.apache.spark.mllib.tree.configuration.{Algo, BoostingStrategy, QuantileStrategy, Strategy} +import org.apache.spark.mllib.tree.impurity.Variance +import org.apache.spark.mllib.tree.loss.{LogLoss, SquaredError} +import org.apache.spark.mllib.tree.model.{GradientBoostedTreesModel, RandomForestModel} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.Row +import org.apache.spark.sql.SQLContext + +import mllib.perf.util.{DataGenerator, DataLoader} + +/** Parent class for tests which run on a large dataset. 
*/ +abstract class RegressionAndClassificationTests[M](sc: SparkContext) extends PerfTest { + + def runTest(rdd: RDD[LabeledPoint]): M + + def validate(model: M, rdd: RDD[LabeledPoint]): Double + + val NUM_EXAMPLES = ("num-examples", "number of examples for regression tests") + val NUM_FEATURES = ("num-features", "number of features of each example for regression tests") + + intOptions = intOptions ++ Seq(NUM_FEATURES) + longOptions = Seq(NUM_EXAMPLES) + + var rdd: RDD[LabeledPoint] = _ + var testRdd: RDD[LabeledPoint] = _ + + override def run(): JValue = { + var start = System.currentTimeMillis() + val model = runTest(rdd) + val trainingTime = (System.currentTimeMillis() - start).toDouble / 1000.0 + + start = System.currentTimeMillis() + val trainingMetric = validate(model, rdd) + val testTime = (System.currentTimeMillis() - start).toDouble / 1000.0 + + val testMetric = validate(model, testRdd) + Map("trainingTime" -> trainingTime, "testTime" -> testTime, + "trainingMetric" -> trainingMetric, "testMetric" -> testMetric) + } + + /** + * For classification + * @param predictions RDD over (prediction, truth) for each instance + * @return Percent correctly classified + */ + def calculateAccuracy(predictions: RDD[(Double, Double)], numExamples: Long): Double = { + predictions.map{case (pred, label) => + if (pred == label) 1.0 else 0.0 + }.sum() * 100.0 / numExamples + } + + /** + * For regression + * @param predictions RDD over (prediction, truth) for each instance + * @return Root mean squared error (RMSE) + */ + def calculateRMSE(predictions: RDD[(Double, Double)], numExamples: Long): Double = { + val error = predictions.map{ case (pred, label) => + (pred - label) * (pred - label) + }.sum() + math.sqrt(error / numExamples) + } +} + +/** Parent class for Generalized Linear Model (GLM) tests */ +abstract class GLMTests(sc: SparkContext) + extends RegressionAndClassificationTests[GeneralizedLinearModel](sc) { + + val STEP_SIZE = ("step-size", "step size for SGD") + val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") + val REG_TYPE = ("reg-type", "type of regularization: none, l1, l2, elastic-net") + val ELASTIC_NET_PARAM = ("elastic-net-param", "elastic-net param, 0.0 for L2, and 1.0 for L1") + val REG_PARAM = ("reg-param", "the regularization parameter against overfitting") + val OPTIMIZER = ("optimizer", "optimization algorithm (elastic-net only supports l-bfgs): sgd, l-bfgs") + + intOptions = intOptions ++ Seq(NUM_ITERATIONS) + doubleOptions = doubleOptions ++ Seq(ELASTIC_NET_PARAM, STEP_SIZE, REG_PARAM) + stringOptions = stringOptions ++ Seq(REG_TYPE, OPTIMIZER) +} + +class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) { + + val INTERCEPT = ("intercept", "intercept for random data generation") + val LABEL_NOISE = ("label-noise", "scale factor for the noise during label generation") + val LOSS = ("loss", "loss to minimize. 
Supported: l2 (squared error).")
+
+  doubleOptions = doubleOptions ++ Seq(INTERCEPT, LABEL_NOISE)
+  stringOptions = stringOptions ++ Seq(LOSS)
+
+  val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions
+  addOptionsToParser()
+
+  override def createInputData(seed: Long) = {
+    val numExamples: Long = longOptionValue(NUM_EXAMPLES)
+    val numFeatures: Int = intOptionValue(NUM_FEATURES)
+    val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
+
+    val intercept: Double = doubleOptionValue(INTERCEPT)
+    val labelNoise: Double = doubleOptionValue(LABEL_NOISE)
+
+    val data = DataGenerator.generateLabeledPoints(sc, math.ceil(numExamples * 1.25).toLong,
+      numFeatures, intercept, labelNoise, numPartitions, seed)
+
+    val split = data.randomSplit(Array(0.8, 0.2), seed)
+
+    rdd = split(0).cache()
+    testRdd = split(1)
+
+    // Materialize rdd
+    println("Num Examples: " + rdd.count())
+  }
+
+  override def validate(model: GeneralizedLinearModel, rdd: RDD[LabeledPoint]): Double = {
+    val numExamples = rdd.count()
+    val predictions: RDD[(Double, Double)] = rdd.map { example =>
+      (model.predict(example.features), example.label)
+    }
+    calculateRMSE(predictions, numExamples)
+  }
+
+  override def runTest(rdd: RDD[LabeledPoint]): GeneralizedLinearModel = {
+    val stepSize = doubleOptionValue(STEP_SIZE)
+    val loss = stringOptionValue(LOSS)
+    val regType = stringOptionValue(REG_TYPE)
+    val regParam = doubleOptionValue(REG_PARAM)
+    val elasticNetParam = doubleOptionValue(ELASTIC_NET_PARAM)
+    val numIterations = intOptionValue(NUM_ITERATIONS)
+    val optimizer = stringOptionValue(OPTIMIZER)
+
+    // Linear Regression only supports squared loss for now.
+    if (!Array("l2").contains(loss)) {
+      throw new IllegalArgumentException(
+        s"GLMRegressionTest run with unknown loss ($loss). Supported values: l2.")
+    }
+
+    if (regType == "elastic-net") { // use spark.ml
+      assert(optimizer == "auto" || optimizer == "l-bfgs", "GLMRegressionTest with" +
+        s" regType=elastic-net expects optimizer to be in {auto, l-bfgs}, but found: $optimizer")
+      println("WARNING: Linear Regression with elastic-net in ML package uses LBFGS/OWLQN for" +
+        " optimization which ignores stepSize.")
+      val rr = new LinearRegression()
+        .setElasticNetParam(elasticNetParam)
+        .setRegParam(regParam)
+        .setMaxIter(numIterations)
+      val sqlContext = new SQLContext(rdd.context)
+      import sqlContext.implicits._
+      val mlModel = rr.fit(rdd.toDF())
+
+      new LinearRegressionModel(Vectors.fromML(mlModel.coefficients),
+        mlModel.intercept)
+
+    } else {
+      assert(optimizer == "sgd", "GLMRegressionTest with" +
+        s" regType!=elastic-net expects optimizer to be sgd, but found: $optimizer")
+      (loss, regType) match {
+        case ("l2", "none") =>
+          val lr = new LinearRegressionWithSGD().setIntercept(addIntercept = true)
+          lr.optimizer
+            .setNumIterations(numIterations)
+            .setStepSize(stepSize)
+            .setConvergenceTol(0.0)
+          lr.run(rdd)
+        case ("l2", "l1") =>
+          val lasso = new LassoWithSGD().setIntercept(addIntercept = true)
+          lasso.optimizer
+            .setNumIterations(numIterations)
+            .setStepSize(stepSize)
+            .setRegParam(regParam)
+            .setConvergenceTol(0.0)
+          lasso.run(rdd)
+        case ("l2", "l2") =>
+          val rr = new RidgeRegressionWithSGD().setIntercept(addIntercept = true)
+          rr.optimizer
+            .setNumIterations(numIterations)
+            .setStepSize(stepSize)
+            .setRegParam(regParam)
+            .setConvergenceTol(0.0)
+          rr.run(rdd)
+        case _ =>
+          throw new IllegalArgumentException(
+            s"GLMRegressionTest given incompatible (loss, regType) = ($loss, $regType)."
+              + s" Note the set of supported combinations increases in later Spark versions.")
+      }
+    }
+  }
+}
+
+class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) {
+
+  val THRESHOLD = ("per-negative", "probability for a negative label during data generation")
+  val FEATURE_NOISE = ("feature-noise", "scale factor for the noise during feature generation")
+  val LOSS = ("loss", "loss to minimize. Supported: logistic, hinge (SVM).")
+
+  doubleOptions = doubleOptions ++ Seq(THRESHOLD, FEATURE_NOISE)
+  stringOptions = stringOptions ++ Seq(LOSS)
+
+  val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions
+  addOptionsToParser()
+
+  override def validate(model: GeneralizedLinearModel, rdd: RDD[LabeledPoint]): Double = {
+    val numExamples = rdd.count()
+    val predictions: RDD[(Double, Double)] = rdd.map { example =>
+      (model.predict(example.features), example.label)
+    }
+    calculateAccuracy(predictions, numExamples)
+  }
+
+  override def createInputData(seed: Long) = {
+    val numExamples: Long = longOptionValue(NUM_EXAMPLES)
+    val numFeatures: Int = intOptionValue(NUM_FEATURES)
+    val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
+
+    val threshold: Double = doubleOptionValue(THRESHOLD)
+    val featureNoise: Double = doubleOptionValue(FEATURE_NOISE)
+
+    val data = DataGenerator.generateClassificationLabeledPoints(sc,
+      math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, featureNoise, numPartitions,
+      seed)
+
+    val split = data.randomSplit(Array(0.8, 0.2), seed)
+
+    rdd = split(0).cache()
+    testRdd = split(1)
+
+    // Materialize rdd
+    println("Num Examples: " + rdd.count())
+  }
+
+  override def runTest(rdd: RDD[LabeledPoint]): GeneralizedLinearModel = {
+    val stepSize = doubleOptionValue(STEP_SIZE)
+    val loss = stringOptionValue(LOSS)
+    val regType = stringOptionValue(REG_TYPE)
+    val regParam = doubleOptionValue(REG_PARAM)
+    val elasticNetParam = doubleOptionValue(ELASTIC_NET_PARAM)
+    val numIterations = intOptionValue(NUM_ITERATIONS)
+    val optimizer = stringOptionValue(OPTIMIZER)
+
+    // For classification problems in GLM, we currently support logistic loss and hinge loss.
+    if (!Array("logistic", "hinge").contains(loss)) {
+      throw new IllegalArgumentException(
+        s"GLMClassificationTest run with unknown loss ($loss). Supported values: logistic, hinge.")
+    }
+
+    if (regType == "elastic-net") { // use spark.ml
+      assert(optimizer == "auto" || optimizer == "l-bfgs", "GLMClassificationTest with" +
+        " regType=elastic-net expects optimizer to be in {auto, l-bfgs}")
+      loss match {
+        case "logistic" =>
+          println("WARNING: Logistic Regression with elastic-net in ML package uses LBFGS/OWLQN" +
+            " for optimization which ignores stepSize.")
+          val lor = new LogisticRegression()
+            .setElasticNetParam(elasticNetParam)
+            .setRegParam(regParam)
+            .setMaxIter(numIterations)
+          val sqlContext = new SQLContext(rdd.context)
+          import sqlContext.implicits._
+          val mlModel = lor.fit(rdd.toDF())
+          new LogisticRegressionModel(Vectors.fromML(mlModel.coefficients), mlModel.intercept)
+        case _ =>
+          throw new IllegalArgumentException(
+            s"GLMClassificationTest given unsupported loss = $loss."
+ + s" Note the set of supported combinations increases in later Spark versions.") + } + } else { + val updater = regType match { + case "none" => new SimpleUpdater + case "l1" => new L1Updater + case "l2" => new SquaredL2Updater + } + (loss, optimizer) match { + case ("logistic", "sgd") => + val lr = new LogisticRegressionWithSGD() + lr.optimizer + .setStepSize(stepSize) + .setNumIterations(numIterations) + .setConvergenceTol(0.0) + .setUpdater(updater) + lr.run(rdd) + case ("logistic", "l-bfgs") => + println("WARNING: LogisticRegressionWithLBFGS ignores stepSize in this Spark version.") + val lr = new LogisticRegressionWithLBFGS() + lr.optimizer + .setNumIterations(numIterations) + .setConvergenceTol(0.0) + .setUpdater(updater) + lr.run(rdd) + case ("hinge", "sgd") => + val svm = new SVMWithSGD() + svm.optimizer + .setNumIterations(numIterations) + .setStepSize(stepSize) + .setRegParam(regParam) + .setConvergenceTol(0.0) + .setUpdater(updater) + svm.run(rdd) + case _ => + throw new IllegalArgumentException( + s"GLMClassificationTest given incompatible (loss, regType) = ($loss, $regType)." + + s" Supported combinations include: (elastic-net, _), (logistic, sgd), (logistic, l-bfgs), (hinge, sgd)." + + s" Note the set of supported combinations increases in later Spark versions.") + } + } + } +} + +abstract class RecommendationTests(sc: SparkContext) extends PerfTest { + + def runTest(rdd: RDD[Rating]): MatrixFactorizationModel + + val NUM_USERS = ("num-users", "number of users for recommendation tests") + val NUM_PRODUCTS = ("num-products", "number of features of each example for recommendation tests") + val NUM_RATINGS = ("num-ratings", "number of ratings for recommendation tests") + val RANK = ("rank", "rank of factorized matrices for recommendation tests") + val IMPLICIT = ("implicit-prefs", "use implicit ratings") + val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") + val REG_PARAM = ("reg-param", "the regularization parameter against overfitting") + + intOptions = intOptions ++ Seq(NUM_USERS, NUM_PRODUCTS, RANK, NUM_ITERATIONS) + longOptions = longOptions ++ Seq(NUM_RATINGS) + booleanOptions = booleanOptions ++ Seq(IMPLICIT) + doubleOptions = doubleOptions ++ Seq(REG_PARAM) + val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions + addOptionsToParser() + + var rdd: RDD[Rating] = _ + var testRdd: RDD[Rating] = _ + + override def createInputData(seed: Long) = { + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + + val numUsers: Int = intOptionValue(NUM_USERS) + val numProducts: Int = intOptionValue(NUM_PRODUCTS) + val numRatings: Long = longOptionValue(NUM_RATINGS) + val implicitRatings: Boolean = booleanOptionValue(IMPLICIT) + + val data = DataGenerator.generateRatings(sc, numUsers, numProducts, + numRatings, implicitRatings, numPartitions, seed) + + rdd = data._1.cache() + testRdd = data._2 + + // Materialize rdd + println("Num Examples: " + rdd.count()) + } + + def validate(model: MatrixFactorizationModel, + data: RDD[Rating]): Double = { + val implicitPrefs: Boolean = booleanOptionValue(IMPLICIT) + val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product))) + val predictionsAndRatings: RDD[(Double, Double)] = predictions.map{ x => + def mapPredictedRating(r: Double) = if (implicitPrefs) math.max(math.min(r, 1.0), 0.0) else r + ((x.user, x.product), mapPredictedRating(x.rating)) + }.join(data.map(x => ((x.user, x.product), x.rating))).values + + math.sqrt(predictionsAndRatings.map(x => 
(x._1 - x._2) * (x._1 - x._2)).mean()) + } + + override def run(): JValue = { + var start = System.currentTimeMillis() + val model = runTest(rdd) + val trainingTime = (System.currentTimeMillis() - start).toDouble / 1000.0 + + start = System.currentTimeMillis() + val trainingMetric = validate(model, rdd) + val testTime = (System.currentTimeMillis() - start).toDouble / 1000.0 + + val testMetric = validate(model, testRdd) + + val numThingsToRecommend = 10 + start = System.currentTimeMillis() + model.recommendProductsForUsers(numThingsToRecommend).count() + val recommendProductsForUsersTime = (System.currentTimeMillis() - start).toDouble / 1000.0 + start = System.currentTimeMillis() + model.recommendUsersForProducts(numThingsToRecommend).count() + val recommendUsersForProductsTime = (System.currentTimeMillis() - start).toDouble / 1000.0 + + Map("trainingTime" -> trainingTime, "testTime" -> testTime, + "trainingMetric" -> trainingMetric, "testMetric" -> testMetric, + "recommendProductsForUsersTime" -> recommendProductsForUsersTime, + "recommendUsersForProductsTime" -> recommendUsersForProductsTime) + } +} + +abstract class ClusteringTests(sc: SparkContext) extends PerfTest { + + def runTest(rdd: RDD[Vector]): KMeansModel + + val NUM_POINTS = ("num-points", "number of points for clustering tests") + val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") + val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") + val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") + + intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) + longOptions = longOptions ++ Seq(NUM_POINTS) + val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions + addOptionsToParser() + + var rdd: RDD[Vector] = _ + var testRdd: RDD[Vector] = _ + + def validate(model: KMeansModel, rdd: RDD[Vector]): Double = { + val numPoints = rdd.cache().count() + + val error = model.computeCost(rdd) + + math.sqrt(error/numPoints) + } + + override def createInputData(seed: Long) = { + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + + val numPoints: Long = longOptionValue(NUM_POINTS) + val numColumns: Int = intOptionValue(NUM_COLUMNS) + val numCenters: Int = intOptionValue(NUM_CENTERS) + + val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numPoints*1.25).toLong, numColumns, + numCenters, numPartitions, seed) + + val split = data.randomSplit(Array(0.8, 0.2), seed) + + rdd = split(0).cache() + testRdd = split(1) + + // Materialize rdd + println("Num Examples: " + rdd.count()) + } + + override def run(): JValue = { + var start = System.currentTimeMillis() + val model = runTest(rdd) + val trainingTime = (System.currentTimeMillis() - start).toDouble / 1000.0 + + start = System.currentTimeMillis() + val trainingMetric = validate(model, rdd) + val testTime = (System.currentTimeMillis() - start).toDouble / 1000.0 + + val testMetric = validate(model, testRdd) + Map("trainingTime" -> trainingTime, "testTime" -> testTime, + "trainingMetric" -> trainingMetric, "testMetric" -> testMetric) + } +} + +// Classification Algorithms + +class NaiveBayesTest(sc: SparkContext) + extends RegressionAndClassificationTests[NaiveBayesModel](sc) { + + val THRESHOLD = ("per-negative", "probability for a negative label during data generation") + val FEATURE_NOISE = ("feature-noise", "scale factor for the noise during feature generation") + val SMOOTHING = ("nb-lambda", "the smoothing parameter lambda for Naive Bayes") + 
val MODEL_TYPE = ("model-type", "either multinomial (default) or bernoulli") + + doubleOptions = doubleOptions ++ Seq(THRESHOLD, FEATURE_NOISE, SMOOTHING) + stringOptions = stringOptions ++ Seq(MODEL_TYPE) + val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions + addOptionsToParser() + + /** Note: using same data generation as for GLMClassificationTest, but should change later */ + override def createInputData(seed: Long) = { + val numExamples: Long = longOptionValue(NUM_EXAMPLES) + val numFeatures: Int = intOptionValue(NUM_FEATURES) + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + + val threshold: Double = doubleOptionValue(THRESHOLD) + val featureNoise: Double = doubleOptionValue(FEATURE_NOISE) + val modelType = stringOptionValue(MODEL_TYPE) + + val data = if (modelType == "bernoulli") { + DataGenerator.generateBinaryLabeledPoints(sc, + math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, numPartitions, seed) + } else { + val negdata = DataGenerator.generateClassificationLabeledPoints(sc, + math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, featureNoise, numPartitions, + seed) + val dataNonneg = negdata.map { lp => + LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map(math.abs))) + } + dataNonneg + } + + val split = data.randomSplit(Array(0.8, 0.2), seed) + + rdd = split(0).cache() + testRdd = split(1) + + // Materialize rdd + println("Num Examples: " + rdd.count()) + } + + override def validate(model: NaiveBayesModel, rdd: RDD[LabeledPoint]): Double = { + val numExamples = rdd.count() + val predictions: RDD[(Double, Double)] = rdd.map { example => + (model.predict(example.features), example.label) + } + calculateAccuracy(predictions, numExamples) + } + + override def runTest(rdd: RDD[LabeledPoint]): NaiveBayesModel = { + val lambda = doubleOptionValue(SMOOTHING) + + val modelType = stringOptionValue(MODEL_TYPE) + NaiveBayes.train(rdd, lambda, modelType) + } +} + + +// Recommendation +class ALSTest(sc: SparkContext) extends RecommendationTests(sc) { + override def runTest(rdd: RDD[Rating]): MatrixFactorizationModel = { + val numIterations: Int = intOptionValue(NUM_ITERATIONS) + val rank: Int = intOptionValue(RANK) + val regParam = doubleOptionValue(REG_PARAM) + val seed = intOptionValue(RANDOM_SEED) + 12 + + new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam) + .setBlocks(rdd.partitions.length).run(rdd) + } +} + +// Clustering +// TODO: refactor into mllib.perf.clustering like the other clustering tests +class KMeansTest(sc: SparkContext) extends ClusteringTests(sc) { + override def runTest(rdd: RDD[Vector]): KMeansModel = { + val numIterations: Int = intOptionValue(NUM_ITERATIONS) + val k: Int = intOptionValue(NUM_CENTERS) + KMeans.train(rdd, k, numIterations) + } +} + +// Decision-tree +sealed trait TreeBasedModel +case class MLlibRFModel(model: RandomForestModel) extends TreeBasedModel +case class MLlibGBTModel(model: GradientBoostedTreesModel) extends TreeBasedModel +case class MLRFRegressionModel(model: RandomForestRegressionModel) extends TreeBasedModel +case class MLRFClassificationModel(model: RandomForestClassificationModel) extends TreeBasedModel +case class MLGBTRegressionModel(model: GBTRegressionModel) extends TreeBasedModel +case class MLGBTClassificationModel(model: GBTClassificationModel) extends TreeBasedModel + +/** + * Parent class for DecisionTree-based tests which run on a large dataset. 
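+ *
+ * Both the spark.mllib entry points and the new spark.ml estimators are
+ * exercised; every trained model is wrapped in the sealed TreeBasedModel
+ * hierarchy above so that validation can match on a single type. A minimal
+ * sketch of that idea (illustrative only, not part of this patch):
+ * {{{
+ * def describe(model: TreeBasedModel): String = model match {
+ *   case MLlibRFModel(m)  => s"mllib forest of ${m.numTrees} trees"
+ *   case MLlibGBTModel(m) => s"mllib GBT of ${m.numTrees} trees"
+ *   case other            => other.toString
+ * }
+ * }}}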
+ */ +abstract class DecisionTreeTests(sc: SparkContext) + extends RegressionAndClassificationTests[TreeBasedModel](sc) { + + val TEST_DATA_FRACTION = + ("test-data-fraction", "fraction of data to hold out for testing (ignored if given training and test dataset)") + val LABEL_TYPE = + ("label-type", "Type of label: 0 indicates regression, 2+ indicates " + + "classification with this many classes") + val FRAC_CATEGORICAL_FEATURES = ("frac-categorical-features", + "Fraction of features which are categorical") + val FRAC_BINARY_FEATURES = + ("frac-binary-features", "Fraction of categorical features which are binary. " + + "Others have 20 categories.") + val TREE_DEPTH = ("tree-depth", "Depth of true decision tree model used to label examples.") + val MAX_BINS = ("max-bins", "Maximum number of bins for the decision tree learning algorithm.") + val NUM_TREES = ("num-trees", "Number of trees to train. If 1, run DecisionTree. If >1, run an ensemble method (RandomForest).") + val FEATURE_SUBSET_STRATEGY = + ("feature-subset-strategy", "Strategy for feature subset sampling. Supported: auto, all, sqrt, log2, onethird.") + + intOptions = intOptions ++ Seq(LABEL_TYPE, TREE_DEPTH, MAX_BINS, NUM_TREES) + doubleOptions = doubleOptions ++ Seq(TEST_DATA_FRACTION, FRAC_CATEGORICAL_FEATURES, FRAC_BINARY_FEATURES) + stringOptions = stringOptions ++ Seq(FEATURE_SUBSET_STRATEGY) + + addOptionalOptionToParser("training-data", "path to training dataset (if not given, use random data)", "", classOf[String]) + addOptionalOptionToParser("test-data", "path to test dataset (only used if training dataset given)" + + " (if not given, hold out part of training data for validation)", "", classOf[String]) + + var categoricalFeaturesInfo: Map[Int, Int] = Map.empty + + protected var labelType = -1 + + def validate(model: TreeBasedModel, rdd: RDD[LabeledPoint]): Double = { + val numExamples = rdd.count() + val predictions: RDD[(Double, Double)] = model match { + case MLlibRFModel(rfModel) => rfModel.predict(rdd.map(_.features)).zip(rdd.map(_.label)) + case MLlibGBTModel(gbtModel) => gbtModel.predict(rdd.map(_.features)).zip(rdd.map(_.label)) + case MLRFRegressionModel(rfModel) => makePredictions(rfModel, rdd) + case MLRFClassificationModel(rfModel) => makePredictions(rfModel, rdd) + case MLGBTRegressionModel(gbtModel) => makePredictions(gbtModel, rdd) + case MLGBTClassificationModel(gbtModel) => makePredictions(gbtModel, rdd) + } + val labelType: Int = intOptionValue(LABEL_TYPE) + if (labelType == 0) { + calculateRMSE(predictions, numExamples) + } else { + calculateAccuracy(predictions, numExamples) + } + } + + // TODO: generate DataFrame outside of `runTest` so it is not included in timing results + private def makePredictions( + model: PredictionModel[org.apache.spark.ml.linalg.Vector, _], rdd: RDD[LabeledPoint]): RDD[(Double, Double)] = { + val labelType: Int = intOptionValue(LABEL_TYPE) + val dataFrame = DataGenerator.setMetadata(rdd, categoricalFeaturesInfo, labelType) + val results = model.transform(dataFrame) + results + .select(model.getPredictionCol, model.getLabelCol) + .rdd + .map { case Row(prediction: Double, label: Double) => (prediction, label) } + } +} + +class DecisionTreeTest(sc: SparkContext) extends DecisionTreeTests(sc) { + val supportedTreeTypes = Array("RandomForest", "GradientBoostedTrees", + "ml.RandomForest", "ml.GradientBoostedTrees") + + val ENSEMBLE_TYPE = ("ensemble-type", "Type of ensemble algorithm: " + supportedTreeTypes.mkString(" ")) + + stringOptions = stringOptions ++ Seq(ENSEMBLE_TYPE) + 
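+  // Every value option declared above is registered as required by
+  // addOptionsToParser() below, so a run has to supply each of them.
+  // An illustrative invocation with hypothetical values (training-data and
+  // test-data are optional and default to "", i.e. synthetic data):
+  //   decision-tree --num-trials 1 --inter-trial-wait 3 --num-partitions 128
+  //     --random-seed 5 --num-examples 1000000 --num-features 500
+  //     --label-type 2 --frac-categorical-features 0.5 --frac-binary-features 0.5
+  //     --tree-depth 5 --max-bins 32 --num-trees 10 --feature-subset-strategy auto
+  //     --test-data-fraction 0.2 --ensemble-type ml.RandomForest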
+ val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions + addOptionsToParser() + + private def getTestDataFraction: Double = { + val testDataFraction: Double = doubleOptionValue(TEST_DATA_FRACTION) + assert(testDataFraction >= 0 && testDataFraction <= 1, s"Bad testDataFraction: $testDataFraction") + testDataFraction + } + + override def createInputData(seed: Long) = { + val trainingDataPath: String = optionValue[String]("training-data") + val (rdds, categoricalFeaturesInfo_, numClasses) = if (trainingDataPath != "") { + println(s"LOADING FILE: $trainingDataPath") + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + val testDataPath: String = optionValue[String]("test-data") + val testDataFraction: Double = getTestDataFraction + DataLoader.loadLibSVMFiles(sc, numPartitions, trainingDataPath, testDataPath, + testDataFraction, seed) + } else { + createSyntheticInputData(seed) + } + assert(rdds.length == 2) + rdd = rdds(0).cache() + testRdd = rdds(1) + categoricalFeaturesInfo = categoricalFeaturesInfo_ + this.labelType = numClasses + + // Materialize rdd + println("Num Examples: " + rdd.count()) + } + + /** + * Create synthetic training and test datasets. + * @return (trainTestDatasets, categoricalFeaturesInfo, numClasses) where + * trainTestDatasets = Array(trainingData, testData), + * categoricalFeaturesInfo is a map of categorical feature arities, and + * numClasses = number of classes label can take. + */ + private def createSyntheticInputData( + seed: Long): (Array[RDD[LabeledPoint]], Map[Int, Int], Int) = { + // Generic test options + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + // Data dimensions and type + val numExamples: Long = longOptionValue(NUM_EXAMPLES) + val numFeatures: Int = intOptionValue(NUM_FEATURES) + val labelType: Int = intOptionValue(LABEL_TYPE) + val fracCategoricalFeatures: Double = doubleOptionValue(FRAC_CATEGORICAL_FEATURES) + val fracBinaryFeatures: Double = doubleOptionValue(FRAC_BINARY_FEATURES) + // Model specification + val treeDepth: Int = intOptionValue(TREE_DEPTH) + + val (rdd_, categoricalFeaturesInfo_) = + DataGenerator.generateDecisionTreeLabeledPoints(sc, math.ceil(numExamples * 1.25).toLong, + numFeatures, numPartitions, labelType, + fracCategoricalFeatures, fracBinaryFeatures, treeDepth, seed) + + val splits = rdd_.randomSplit(Array(0.8, 0.2), seed) + (splits, categoricalFeaturesInfo_, labelType) + } + + // TODO: generate DataFrame outside of `runTest` so it is not included in timing results + override def runTest(rdd: RDD[LabeledPoint]): TreeBasedModel = { + val treeDepth: Int = intOptionValue(TREE_DEPTH) + val maxBins: Int = intOptionValue(MAX_BINS) + val numTrees: Int = intOptionValue(NUM_TREES) + val featureSubsetStrategy: String = stringOptionValue(FEATURE_SUBSET_STRATEGY) + val ensembleType: String = stringOptionValue(ENSEMBLE_TYPE) + if (!supportedTreeTypes.contains(ensembleType)) { + throw new IllegalArgumentException( + s"DecisionTreeTest given unknown ensembleType param: $ensembleType." 
+ + " Supported values: " + supportedTreeTypes.mkString(" ")) + } + if (labelType == 0) { + // Regression + ensembleType match { + case "RandomForest" => + MLlibRFModel(RandomForest.trainRegressor(rdd, categoricalFeaturesInfo, numTrees, + featureSubsetStrategy, "variance", treeDepth, maxBins, this.getRandomSeed)) + case "ml.RandomForest" => + val labelType: Int = intOptionValue(LABEL_TYPE) + val dataset = DataGenerator.setMetadata(rdd, categoricalFeaturesInfo, labelType) + val model = new RandomForestRegressor() + .setImpurity("variance") + .setMaxDepth(treeDepth) + .setMaxBins(maxBins) + .setNumTrees(numTrees) + .setFeatureSubsetStrategy(featureSubsetStrategy) + .setSeed(this.getRandomSeed) + .fit(dataset) + MLRFRegressionModel(model) + case "GradientBoostedTrees" => + val treeStrategy = new Strategy(Algo.Regression, Variance, treeDepth, + labelType, maxBins, QuantileStrategy.Sort, categoricalFeaturesInfo) + val boostingStrategy = BoostingStrategy(treeStrategy, SquaredError, numTrees, + learningRate = 0.1) + MLlibGBTModel(GradientBoostedTrees.train(rdd, boostingStrategy)) + case "ml.GradientBoostedTrees" => + val labelType: Int = intOptionValue(LABEL_TYPE) + val dataset = DataGenerator.setMetadata(rdd, categoricalFeaturesInfo, labelType) + val model = new GBTRegressor() + .setLossType("squared") + .setMaxBins(maxBins) + .setMaxDepth(treeDepth) + .setMaxIter(numTrees) + .setStepSize(0.1) + .setSeed(this.getRandomSeed) + .fit(dataset) + MLGBTRegressionModel(model) + } + } else if (labelType >= 2) { + // Classification + ensembleType match { + case "RandomForest" => + MLlibRFModel(RandomForest.trainClassifier(rdd, labelType, categoricalFeaturesInfo, numTrees, + featureSubsetStrategy, "gini", treeDepth, maxBins, this.getRandomSeed)) + case "ml.RandomForest" => + val labelType: Int = intOptionValue(LABEL_TYPE) + val dataset = DataGenerator.setMetadata(rdd, categoricalFeaturesInfo, labelType) + val model = new RandomForestClassifier() + .setImpurity("gini") + .setMaxDepth(treeDepth) + .setMaxBins(maxBins) + .setNumTrees(numTrees) + .setFeatureSubsetStrategy(featureSubsetStrategy) + .setSeed(this.getRandomSeed) + .fit(dataset) + MLRFClassificationModel(model) + case "GradientBoostedTrees" => + val treeStrategy = new Strategy(Algo.Classification, Variance, treeDepth, + labelType, maxBins, QuantileStrategy.Sort, categoricalFeaturesInfo) + val boostingStrategy = BoostingStrategy(treeStrategy, LogLoss, numTrees, + learningRate = 0.1) + MLlibGBTModel(GradientBoostedTrees.train(rdd, boostingStrategy)) + case "ml.GradientBoostedTrees" => + val labelType: Int = intOptionValue(LABEL_TYPE) + val dataset = DataGenerator.setMetadata(rdd, categoricalFeaturesInfo, labelType) + val model = new GBTClassifier() + .setLossType("logistic") + .setMaxBins(maxBins) + .setMaxDepth(treeDepth) + .setMaxIter(numTrees) + .setStepSize(0.1) + .setSeed(this.getRandomSeed) + .fit(dataset) + MLGBTClassificationModel(model) + } + } else { + throw new IllegalArgumentException(s"Bad label-type parameter " + + s"given to DecisionTreeTest: $labelType") + } + } +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/PerfTest.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/PerfTest.scala new file mode 100644 index 0000000..bf51482 --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/PerfTest.scala @@ -0,0 +1,134 @@ +package mllib.perf + +import scala.collection.JavaConverters._ + +import joptsimple.{OptionSet, OptionParser} + +import org.json4s._ + +import org.slf4j._ + +abstract class PerfTest { + + val NUM_TRIALS = 
("num-trials", "number of trials to run") + val INTER_TRIAL_WAIT = ("inter-trial-wait", "seconds to sleep between trials") + val NUM_PARTITIONS = ("num-partitions", "number of input partitions") + val RANDOM_SEED = ("random-seed", "seed for random number generator") + + val log = LoggerFactory.getLogger("PerfTest") + def logInfo(msg: String) { + if (log.isInfoEnabled) { + log.info(msg) + } + } + + /** Initialize internal state based on arguments */ + def initialize(testName_ : String, otherArgs: Array[String]) { + testName = testName_ + optionSet = parser.parse(otherArgs:_*) + } + + def getRandomSeed: Int = { + intOptionValue(RANDOM_SEED) + } + + def getNumTrials: Int = { + intOptionValue(NUM_TRIALS) + } + + def getWait: Int = { + intOptionValue(INTER_TRIAL_WAIT) * 1000 + } + + def createInputData(seed: Long) + + /** + * Runs the test and returns a JSON object that captures performance metrics, such as time taken, + * and values of any parameters. + * + * The rendered JSON will look like this (except it will be minified): + * + * { + * "options": { + * "num-partitions": "10", + * "unique-values": "10", + * ... + * }, + * "results": [ + * { + * "trainingTime": 0.211, + * "trainingMetric": 98.1, + * ... + * }, + * ... + * ] + * } + * + * @return metrics from run (e.g. ("time" -> time) + * */ + def run(): JValue + + val parser = new OptionParser() + var optionSet: OptionSet = _ + var testName: String = _ + + var intOptions: Seq[(String, String)] = Seq(NUM_TRIALS, INTER_TRIAL_WAIT, NUM_PARTITIONS, + RANDOM_SEED) + + var doubleOptions: Seq[(String, String)] = Seq() + var longOptions: Seq[(String, String)] = Seq() + + var stringOptions: Seq[(String, String)] = Seq() + var booleanOptions: Seq[(String, String)] = Seq() + + def addOptionsToParser() { + // add all the options to parser + stringOptions.map{case (opt, desc) => + parser.accepts(opt, desc).withRequiredArg().ofType(classOf[String]).required() + } + booleanOptions.map{case (opt, desc) => + parser.accepts(opt, desc) + } + intOptions.map{case (opt, desc) => + parser.accepts(opt, desc).withRequiredArg().ofType(classOf[Int]).required() + } + doubleOptions.map{case (opt, desc) => + parser.accepts(opt, desc).withRequiredArg().ofType(classOf[Double]).required() + } + longOptions.map{case (opt, desc) => + parser.accepts(opt, desc).withRequiredArg().ofType(classOf[Long]).required() + } + } + + def addOptionalOptionToParser[T](opt: String, desc: String, default: T, clazz: Class[T]): Unit = { + parser.accepts(opt, desc).withOptionalArg().ofType(clazz).defaultsTo(default) + } + + def intOptionValue(option: (String, String)) = + optionSet.valueOf(option._1).asInstanceOf[Int] + + def stringOptionValue(option: (String, String)) = + optionSet.valueOf(option._1).asInstanceOf[String] + + def booleanOptionValue(option: (String, String)) = + optionSet.has(option._1) + + def doubleOptionValue(option: (String, String)) = + optionSet.valueOf(option._1).asInstanceOf[Double] + + def longOptionValue(option: (String, String)) = + optionSet.valueOf(option._1).asInstanceOf[Long] + + def optionValue[T](option: String) = + optionSet.valueOf(option).asInstanceOf[T] + + def getOptions: Map[String, String] = { + optionSet.asMap().asScala.flatMap { case (spec, values) => + if (spec.options().size() == 1 && values.size() == 1) { + Some((spec.options().iterator().next(), values.iterator().next().toString)) + } else { + None + } + }.toMap + } +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/StatTests.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/StatTests.scala 
new file mode 100644 index 0000000..21c286c --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/StatTests.scala @@ -0,0 +1,109 @@ +package mllib.perf + +import org.json4s.JsonDSL._ +import org.json4s.JsonAST._ + +import scala.util.Random + +import org.apache.spark.mllib.linalg.{Matrices, Vectors, Matrix, Vector} +import org.apache.spark.SparkContext +import org.apache.spark.mllib.random.RandomRDDs +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.rdd.RDD + +import mllib.perf.util.DataGenerator + + +/** + * Parent class for the tests for the statistics toolbox + */ +abstract class StatTests[T](sc: SparkContext) extends PerfTest { + + def runTest(rdd: T) + + val NUM_ROWS = ("num-rows", "number of rows of the matrix") + val NUM_COLS = ("num-cols", "number of columns of the matrix") + + longOptions = Seq(NUM_ROWS) + intOptions = intOptions ++ Seq(NUM_COLS) + + var rdd: T = _ + + val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions + addOptionsToParser() + + override def run(): JValue = { + val start = System.currentTimeMillis() + runTest(rdd) + val end = System.currentTimeMillis() + val time = (end - start).toDouble / 1000.0 + Map("time" -> time) + } +} + +abstract class CorrelationTests(sc: SparkContext) extends StatTests[RDD[Vector]](sc){ + override def createInputData(seed: Long) = { + val m: Long = longOptionValue(NUM_ROWS) + val n: Int = intOptionValue(NUM_COLS) + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + + rdd = RandomRDDs.normalVectorRDD(sc, m, n, numPartitions, seed).cache() + + // Materialize rdd + println("Num Examples: " + rdd.count()) + } +} + +class PearsonCorrelationTest(sc: SparkContext) extends CorrelationTests(sc) { + override def runTest(data: RDD[Vector]) { + Statistics.corr(data) + } +} + +class SpearmanCorrelationTest(sc: SparkContext) extends CorrelationTests(sc) { + override def runTest(data: RDD[Vector]) { + Statistics.corr(data, "spearman") + } +} + +class ChiSquaredFeatureTest(sc: SparkContext) extends StatTests[RDD[LabeledPoint]](sc) { + override def createInputData(seed: Long) = { + val m: Long = longOptionValue(NUM_ROWS) + val n: Int = intOptionValue(NUM_COLS) + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + + rdd = DataGenerator.generateClassificationLabeledPoints(sc, m, n, 0.5, 1.0, numPartitions, + seed, chiSq = true).cache() + + // Materialize rdd + println("Num Examples: " + rdd.count()) + } + override def runTest(data: RDD[LabeledPoint]) { + Statistics.chiSqTest(data) + } +} + +class ChiSquaredGoFTest(sc: SparkContext) extends StatTests[Vector](sc) { + override def createInputData(seed: Long) = { + val m: Long = longOptionValue(NUM_ROWS) + val rng = new Random(seed) + + rdd = Vectors.dense(Array.fill(m.toInt)(rng.nextDouble())) + } + override def runTest(data: Vector) { + Statistics.chiSqTest(data) + } +} + +class ChiSquaredMatTest(sc: SparkContext) extends StatTests[Matrix](sc) { + override def createInputData(seed: Long) = { + val m: Long = longOptionValue(NUM_ROWS) + val rng = new Random(seed) + + rdd = Matrices.dense(m.toInt, m.toInt, Array.fill(m.toInt * m.toInt)(rng.nextDouble())) + } + override def runTest(data: Matrix) { + Statistics.chiSqTest(data) + } +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/TestRunner.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/TestRunner.scala new file mode 100644 index 0000000..421b62f --- /dev/null +++ 
b/mllib-tests/v2p0/src/main/scala/mllib/perf/TestRunner.scala @@ -0,0 +1,87 @@ +package mllib.perf + +import scala.collection.JavaConverters._ + +import org.json4s.JsonAST._ +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + +import org.apache.spark.{SparkConf, SparkContext} + +import mllib.perf.clustering.{GaussianMixtureTest, LDATest, PICTest} +import mllib.perf.feature.Word2VecTest +import mllib.perf.fpm.{FPGrowthTest, PrefixSpanTest} +import mllib.perf.linalg.BlockMatrixMultTest + +object TestRunner { + def main(args: Array[String]) { + if (args.length < 1) { + println( + "mllib.perf.TestRunner requires 1 or more args, you gave %s, exiting".format(args.length)) + System.exit(1) + } + val testName = args(0) + val perfTestArgs = args.slice(1, args.length) + val sc = new SparkContext(new SparkConf().setAppName("TestRunner: " + testName)) + + // Unfortunate copy of code because there are Perf Tests in both projects and the compiler doesn't like it + val test: PerfTest = testName match { + case "glm-regression" => new GLMRegressionTest(sc) + case "glm-classification" => new GLMClassificationTest(sc) + case "naive-bayes" => new NaiveBayesTest(sc) + // recommendation + case "als" => new ALSTest(sc) + // clustering + case "gmm" => new GaussianMixtureTest(sc) + case "kmeans" => new KMeansTest(sc) + case "lda" => new LDATest(sc) + case "pic" => new PICTest(sc) + // trees + case "decision-tree" => new DecisionTreeTest(sc) + // linalg + case "svd" => new SVDTest(sc) + case "pca" => new PCATest(sc) + case "block-matrix-mult" => new BlockMatrixMultTest(sc) + // stats + case "summary-statistics" => new ColumnSummaryStatisticsTest(sc) + case "pearson" => new PearsonCorrelationTest(sc) + case "spearman" => new SpearmanCorrelationTest(sc) + case "chi-sq-feature" => new ChiSquaredFeatureTest(sc) + case "chi-sq-gof" => new ChiSquaredGoFTest(sc) + case "chi-sq-mat" => new ChiSquaredMatTest(sc) + // feature + case "word2vec" => new Word2VecTest(sc) + // frequent pattern mining + case "fp-growth" => new FPGrowthTest(sc) + case "prefix-span" => new PrefixSpanTest(sc) + } + test.initialize(testName, perfTestArgs) + // Generate a new dataset for each test + val rand = new java.util.Random(test.getRandomSeed) + + val numTrials = test.getNumTrials + val interTrialWait = test.getWait + + var testOptions: JValue = test.getOptions + val results: Seq[JValue] = (1 to numTrials).map { i => + test.createInputData(rand.nextLong()) + val res: JValue = test.run() + System.gc() + Thread.sleep(interTrialWait) + res + } + // Report the test results as a JSON object describing the test options, Spark + // configuration, Java system properties, as well as the per-test results. + // This extra information helps to ensure reproducibility and makes automatic analysis easier. 
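+    // Shape of the rendered report (illustrative values; printed minified):
+    //   {"testName":"als","options":{"num-trials":"1",...},"sparkConf":{...},
+    //    "sparkVersion":"2.0.0-preview","systemProperties":{...},
+    //    "results":[{"trainingTime":12.3,"testTime":1.7,...}]}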
+ val json: JValue = + ("testName" -> testName) ~ + ("options" -> testOptions) ~ + ("sparkConf" -> sc.getConf.getAll.toMap) ~ + ("sparkVersion" -> sc.version) ~ + ("systemProperties" -> System.getProperties.asScala.toMap) ~ + ("results" -> results) + println("results: " + compact(render(json))) + + sc.stop() + } +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala new file mode 100644 index 0000000..95ce9c6 --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala @@ -0,0 +1,63 @@ +package mllib.perf.clustering + +import java.util.Random + +import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} +import org.json4s.JValue +import org.json4s.JsonDSL._ + +import org.apache.spark.SparkContext +import org.apache.spark.mllib.clustering.GaussianMixture +import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.rdd.RDD + +import mllib.perf.PerfTest + +class GaussianMixtureTest(sc: SparkContext) extends PerfTest { + + val NUM_POINTS = ("num-points", "number of points for clustering tests") + val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") + val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") + val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") + + intOptions ++= Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) + longOptions ++= Seq(NUM_POINTS) + val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions + addOptionsToParser() + + var data: RDD[Vector] = _ + + override def createInputData(seed: Long): Unit = { + val m = longOptionValue(NUM_POINTS) + val n = intOptionValue(NUM_COLUMNS) + val k = intOptionValue(NUM_CENTERS) + val p = intOptionValue(NUM_PARTITIONS) + + val random = new Random(seed ^ 8793480384L) + val mu = Array.fill(k)(new BDV[Double](Array.fill(n)(random.nextGaussian()))) + val f = Array.fill(k)(new BDM[Double](n, n, Array.fill(n * n)(random.nextGaussian()))) + data = sc.parallelize(0L until m, p) + .mapPartitionsWithIndex { (idx, part) => + val rng = new Random(seed & idx) + part.map { _ => + val i = (rng.nextDouble() * k).toInt + val x = new BDV[Double](Array.fill(n)(rng.nextGaussian())) + val y = f(i) * x + mu(i) + Vectors.dense(y.data) + } + }.cache() + logInfo(s"Generated ${data.count()} points.") + } + + override def run(): JValue = { + val numIterations = intOptionValue(NUM_ITERATIONS) + val k = intOptionValue(NUM_CENTERS) + val start = System.currentTimeMillis() + val gmm = new GaussianMixture() + .setK(k) + .setMaxIterations(numIterations) + val model = gmm.run(data) + val duration = (System.currentTimeMillis() - start) / 1e3 + "time" -> duration + } +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/LDATest.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/LDATest.scala new file mode 100644 index 0000000..812f2da --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/LDATest.scala @@ -0,0 +1,73 @@ +package mllib.perf.clustering + +import mllib.perf.PerfTest + +import org.json4s.JValue +import org.json4s.JsonDSL._ + +import scala.collection.mutable.{HashMap => MHashMap} + +import org.apache.commons.math3.random.Well19937c +import org.apache.spark.SparkContext +import org.apache.spark.mllib.clustering.LDA +import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.rdd.RDD + +class 
LDATest(sc: SparkContext) extends PerfTest { + + val NUM_DOCUMENTS = ("num-documents", "number of documents in corpus") + val NUM_VOCABULARY = ("num-vocab", "number of terms in vocabulary") + val NUM_TOPICS = ("num-topics", "number of topics to infer") + val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") + val DOCUMENT_LENGTH = ("document-length", "number of words per document for the algorithm") + val OPTIMIZER = ("optimizer", "optimization algorithm: em or online") + + intOptions ++= Seq(NUM_VOCABULARY, NUM_TOPICS, NUM_ITERATIONS, DOCUMENT_LENGTH) + longOptions ++= Seq(NUM_DOCUMENTS) + stringOptions ++= Seq(OPTIMIZER) + val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions + addOptionsToParser() + + var data: RDD[(Long, Vector)] = _ + + override def createInputData(seed: Long): Unit = { + val numDocs = longOptionValue(NUM_DOCUMENTS) + val numVocab = intOptionValue(NUM_VOCABULARY) + val k = intOptionValue(NUM_TOPICS) + + val numPartitions = intOptionValue(NUM_PARTITIONS) + val docLength = intOptionValue(DOCUMENT_LENGTH) + + data = sc.parallelize(0L until numDocs, numPartitions) + .mapPartitionsWithIndex { (idx, part) => + val rng = new Well19937c(seed ^ idx) + part.map { case docIndex => + var currentSize = 0 + val entries = MHashMap[Int, Int]() + while (currentSize < docLength) { + val index = rng.nextInt(numVocab) + entries(index) = entries.getOrElse(index, 0) + 1 + currentSize += 1 + } + + val iter = entries.toSeq.map(v => (v._1, v._2.toDouble)) + (docIndex, Vectors.sparse(numVocab, iter)) + } + }.cache() + logInfo(s"Number of documents = ${data.count()}.") + } + + override def run(): JValue = { + val k = intOptionValue(NUM_TOPICS) + val numIterations = intOptionValue(NUM_ITERATIONS) + val optimizer = stringOptionValue(OPTIMIZER) + val start = System.currentTimeMillis() + val lda = new LDA() + .setK(k) + .setMaxIterations(numIterations) + .setOptimizer(optimizer) + val model = lda.run(data) + val duration = (System.currentTimeMillis() - start) / 1e3 + "time" -> duration + } +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/PICTest.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/PICTest.scala new file mode 100644 index 0000000..6832ffa --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/clustering/PICTest.scala @@ -0,0 +1,53 @@ +package mllib.perf.clustering + +import org.json4s.JValue +import org.json4s.JsonDSL._ + +import org.apache.spark.SparkContext +import org.apache.spark.mllib.clustering.PowerIterationClustering +import org.apache.spark.rdd.RDD + +import mllib.perf.PerfTest + +class PICTest(sc: SparkContext) extends PerfTest { + + val NUM_POINTS = ("num-points", "number of points") + val NODE_DEGREE = ("node-degree", "number of neighbors each node is connected to") + val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") + val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") + + intOptions ++= Seq(NODE_DEGREE, NUM_CENTERS, NUM_ITERATIONS) + longOptions ++= Seq(NUM_POINTS) + val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions + addOptionsToParser() + + var data: RDD[(Long, Long, Double)] = _ + + override def createInputData(seed: Long): Unit = { + val numPoints = longOptionValue(NUM_POINTS) + val nodeDegree = intOptionValue(NODE_DEGREE) + val numPartitions = intOptionValue(NUM_PARTITIONS) + + // Generates a periodic banded matrix with bandwidth = nodeDegree + val data = 
sc.parallelize(0L until numPoints, numPartitions)
+      .flatMap { id =>
+        (((id - nodeDegree / 2) % numPoints) until id).map { nbr =>
+          (id, (nbr + numPoints) % numPoints, 1D)
+        }
+      }
+    // Assign to the field read by run(); the local `val data` above would
+    // otherwise shadow it and leave the field null.
+    this.data = data
+    logInfo(s"Generated ${data.count()} pairwise similarities.")
+  }
+
+  override def run(): JValue = {
+    val numIterations = intOptionValue(NUM_ITERATIONS)
+    val k = intOptionValue(NUM_CENTERS)
+    val start = System.currentTimeMillis()
+    val pic = new PowerIterationClustering()
+      .setK(k)
+      .setMaxIterations(numIterations)
+    val model = pic.run(data)
+    val duration = (System.currentTimeMillis() - start) / 1e3
+    "time" -> duration
+  }
+}
+
diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/feature/Word2VecTest.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/feature/Word2VecTest.scala
new file mode 100644
index 0000000..389d094
--- /dev/null
+++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/feature/Word2VecTest.scala
@@ -0,0 +1,69 @@
+package mllib.perf.feature
+
+import scala.collection.mutable
+
+import org.apache.commons.math3.random.Well19937c
+import org.json4s.JValue
+import org.json4s.JsonDSL._
+
+import org.apache.spark.SparkContext
+import org.apache.spark.mllib.feature.Word2Vec
+import org.apache.spark.rdd.RDD
+
+import mllib.perf.PerfTest
+
+class Word2VecTest(sc: SparkContext) extends PerfTest {
+
+  val NUM_SENTENCES = ("num-sentences", "number of sentences")
+  val NUM_WORDS = ("num-words", "vocabulary size")
+  val VECTOR_SIZE = ("vector-size", "vector size")
+  val NUM_ITERATIONS = ("num-iterations", "number of iterations")
+  val MIN_COUNT = ("min-count", "minimum count for a word to be included")
+
+  intOptions ++= Seq(NUM_SENTENCES, NUM_WORDS, VECTOR_SIZE, NUM_ITERATIONS, MIN_COUNT)
+
+  val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions
+  addOptionsToParser()
+
+  private val avgSentenceLength = 16
+  private var sentences: RDD[Seq[String]] = _
+
+  override def createInputData(seed: Long): Unit = {
+    val numSentences = intOptionValue(NUM_SENTENCES)
+    val numPartitions = intOptionValue(NUM_PARTITIONS)
+    val numWords = intOptionValue(NUM_WORDS)
+    val p = 1.0 / avgSentenceLength
+    sentences = sc.parallelize(0 until numSentences, numPartitions)
+      .mapPartitionsWithIndex { (idx, part) =>
+        val rng = new Well19937c(seed ^ idx)
+        part.map { case i =>
+          var cur = rng.nextInt(numWords)
+          val sentence = mutable.ArrayBuilder.make[Int]
+          while (rng.nextDouble() > p) {
+            cur = (cur + rng.nextGaussian() * 10).toInt % numWords
+            if (cur < 0) {
+              cur += numWords
+            }
+            sentence += cur
+          }
+          sentence.result().map(_.toString).toSeq
+        }
+      }.cache()
+    logInfo(s"Number of sentences = ${sentences.count()}.")
+  }
+
+  override def run(): JValue = {
+    val start = System.currentTimeMillis()
+    val numIterations = intOptionValue(NUM_ITERATIONS)
+    val numPartitions = math.ceil(math.pow(numIterations, 1.5)).toInt
+    val w2v = new Word2Vec()
+      .setNumPartitions(numPartitions)
+      .setNumIterations(numIterations)
+      .setVectorSize(intOptionValue(VECTOR_SIZE))
+      .setMinCount(intOptionValue(MIN_COUNT))
+      .setSeed(0L)
+    val model = w2v.fit(sentences)
+    val duration = (System.currentTimeMillis() - start) / 1e3
+    "time" -> duration
+  }
+}
diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/fpm/FPGrowthTest.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/fpm/FPGrowthTest.scala
new file mode 100644
index 0000000..ddc19d0
--- /dev/null
+++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/fpm/FPGrowthTest.scala
@@ -0,0 +1,65 @@
+package mllib.perf.fpm
+
+import
org.apache.commons.math3.distribution.BinomialDistribution +import org.apache.commons.math3.random.Well19937c +import org.json4s.JValue +import org.json4s.JsonDSL._ + +import org.apache.spark.SparkContext +import org.apache.spark.mllib.fpm.FPGrowth +import org.apache.spark.rdd.RDD + +import mllib.perf.PerfTest + +class FPGrowthTest(sc: SparkContext) extends PerfTest { + + val NUM_BASKETS = ("num-baskets", "number of baskets") + val AVG_BASKET_SIZE = ("avg-basket-size", "average basket size. " + + "The distribution of basket sizes follows binomial distribution with B(10n,1/10).") + val NUM_ITEMS = ("num-items", "number of distinct items") + val MIN_SUPPORT = ("min-support", "minimum support level") + + intOptions = intOptions ++ Seq(NUM_BASKETS, AVG_BASKET_SIZE, NUM_ITEMS) + doubleOptions = doubleOptions ++ Seq(MIN_SUPPORT) + + val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions + addOptionsToParser() + + private var baskets: RDD[Array[Int]] = _ + + override def createInputData(seed: Long): Unit = { + val numPartitions = intOptionValue(NUM_PARTITIONS) + val numBaskets = intOptionValue(NUM_BASKETS) + val numItems = intOptionValue(NUM_ITEMS) + val avgBasketSize = intOptionValue(AVG_BASKET_SIZE) + val maxRatio = 10 + baskets = sc.parallelize(0 until numBaskets, numPartitions) + .mapPartitionsWithIndex { (idx, part) => + val rng = new Well19937c(seed ^ idx) + val binom = new BinomialDistribution(rng, maxRatio * avgBasketSize, 1.0 / maxRatio) + part.map { i => + val basketSize = binom.sample() + // Use math.pow to create a skewed item distribution. + val items = Array.fill(basketSize)((numItems * math.pow(rng.nextDouble(), 0.1)).toInt) + items.toSet[Int].toArray // dedup + }.filter(_.nonEmpty) + }.cache() + val exactNumBaskets = baskets.count() + logInfo(s"Number of baskets: $exactNumBaskets.") + val totalNumItems = baskets.map(_.length.toLong).reduce(_ + _) + logInfo(s"Total number of items: $totalNumItems.") + logInfo(s"Average basket size: ${totalNumItems.toDouble/exactNumBaskets}.") + } + + override def run(): JValue = { + val start = System.currentTimeMillis() + val model = new FPGrowth() + .setMinSupport(doubleOptionValue(MIN_SUPPORT)) + .setNumPartitions(baskets.partitions.length * 8) + .run(baskets) + val numFreqItemsets = model.freqItemsets.count() + val duration = (System.currentTimeMillis() - start) / 1000.0 + logInfo(s"Number of frequent itemsets: $numFreqItemsets.") + "time" -> duration + } +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/fpm/PrefixSpanTest.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/fpm/PrefixSpanTest.scala new file mode 100644 index 0000000..b8fc590 --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/fpm/PrefixSpanTest.scala @@ -0,0 +1,82 @@ +package mllib.perf.fpm + +import org.apache.commons.math3.distribution.BinomialDistribution +import org.apache.commons.math3.random.Well19937c +import org.json4s.JValue +import org.json4s.JsonDSL._ + +import org.apache.spark.SparkContext +import org.apache.spark.mllib.fpm.PrefixSpan +import org.apache.spark.rdd.RDD + +import mllib.perf.PerfTest + +class PrefixSpanTest(sc: SparkContext) extends PerfTest { + + val NUM_SEQUENCES = ("num-sequences", "number of itemset sequences") + val AVG_SEQUENCE_SIZE = ("avg-sequence-size", "average number of itemsets in a sequence. " + + "The distribution of itemset sequence sizes follows binomial distribution with B(10n,1/10).") + val AVG_ITEMSET_SIZE = ("avg-itemset-size", "average number of items in a itemset. 
" + + "The distribution of itemset sizes follows binomial distribution with B(10n,1/10).") + val NUM_ITEMS = ("num-items", "number of distinct items") + val MIN_SUPPORT = ("min-support", "minimum support level") + val MAX_PATTERN_LEN = ("max-pattern-len", "maximum length of frequent itemset sequences") + val MAX_LOCAL_PROJ_DB_SIZE = ("max-local-proj-db-size", "maximum number of items allowed in a " + + "locally processed projected database") + + intOptions ++= Seq(NUM_SEQUENCES, AVG_SEQUENCE_SIZE, AVG_ITEMSET_SIZE, NUM_ITEMS, + MAX_PATTERN_LEN, MAX_LOCAL_PROJ_DB_SIZE) + doubleOptions ++= Seq(MIN_SUPPORT) + longOptions ++= Seq(MAX_LOCAL_PROJ_DB_SIZE) + + + val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions + addOptionsToParser() + + private var sequences: RDD[Array[Array[Int]]] = _ + + override def createInputData(seed: Long): Unit = { + val numPartitions = intOptionValue(NUM_PARTITIONS) + val numSequences = intOptionValue(NUM_SEQUENCES) + val numItems = intOptionValue(NUM_ITEMS) + val avgSequenceSize = intOptionValue(AVG_SEQUENCE_SIZE) + val avgItemsetSize = intOptionValue(AVG_ITEMSET_SIZE) + val maxRatio = 10 + sequences = sc.parallelize(0 until numSequences, numPartitions) + .mapPartitionsWithIndex { (idx, part) => + val rng = new Well19937c(seed ^ idx) + val binomSeq = new BinomialDistribution(rng, maxRatio * avgSequenceSize, 1.0 / maxRatio) + val binomItemset = new BinomialDistribution(rng, maxRatio * avgItemsetSize, 1.0 / maxRatio) + part.map { i => + val seqSize = binomSeq.sample() + // Use math.pow to create a skewed item distribution. + val items = Array.fill(seqSize)( + Array.fill(binomItemset.sample())((numItems * math.pow(rng.nextDouble(), 0.1)).toInt) + ) + items.map(_.toSet[Int].toArray) // dedup + }.filter(_.nonEmpty) + }.cache() + val exactNumSeqs = sequences.count() + logInfo(s"Number of sequences: $exactNumSeqs.") + val totalNumItems = sequences.map(_.flatten.length.toLong).reduce(_ + _) + val totalNumItemsets = sequences.map(_.length.toLong).reduce(_ + _) + logInfo(s"Total number of items: $totalNumItems.") + logInfo(s"Total number of itemsets: $totalNumItemsets.") + logInfo(s"Average num itemsets per sequence: ${totalNumItemsets.toDouble/exactNumSeqs}.") + logInfo(s"Average num items per itemset: ${totalNumItems.toDouble/totalNumItemsets}.") + } + + override def run(): JValue = { + val start = System.currentTimeMillis() + val model = new PrefixSpan() + .setMinSupport(doubleOptionValue(MIN_SUPPORT)) + .setMaxPatternLength(intOptionValue(MAX_PATTERN_LEN)) + .setMaxLocalProjDBSize(longOptionValue(MAX_LOCAL_PROJ_DB_SIZE)) + .run(sequences) + val numFreqItemsets = model.freqSequences.count() + val duration = (System.currentTimeMillis() - start) / 1000.0 + logInfo(s"Number of frequent sequences: $numFreqItemsets.") + "time" -> duration + } +} + diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/linalg/BlockMatrixMultTest.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/linalg/BlockMatrixMultTest.scala new file mode 100644 index 0000000..123368a --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/linalg/BlockMatrixMultTest.scala @@ -0,0 +1,74 @@ +package mllib.perf.linalg + +import java.util.Random + +import org.json4s.JValue +import org.json4s.JsonDSL._ + +import org.apache.spark.SparkContext +import org.apache.spark.mllib.linalg.Matrices +import org.apache.spark.mllib.linalg.distributed.BlockMatrix + +import mllib.perf.PerfTest + +class BlockMatrixMultTest(sc: SparkContext) extends PerfTest { + + val M = ("m", "number 
of rows of A")
+  val K = ("k", "number of columns of A, the same as number of rows of B")
+  val N = ("n", "number of columns of B")
+  val BLOCK_SIZE = ("block-size", "block size")
+
+  intOptions ++= Seq(BLOCK_SIZE)
+  longOptions ++= Seq(M, K, N)
+
+  val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions
+  addOptionsToParser()
+
+  private var A: BlockMatrix = _
+  private var B: BlockMatrix = _
+
+  override def createInputData(seed: Long): Unit = {
+    val m = longOptionValue(M)
+    val k = longOptionValue(K)
+    val n = longOptionValue(N)
+    val blockSize = intOptionValue(BLOCK_SIZE)
+    val numPartitions = intOptionValue(NUM_PARTITIONS)
+
+    val random = new Random(seed)
+
+    A = randn(m, k, blockSize, numPartitions, seed ^ random.nextLong())
+    B = randn(k, n, blockSize, numPartitions, seed ^ random.nextLong())
+  }
+
+  def randn(
+      m: Long,
+      n: Long,
+      blockSize: Int,
+      numPartitions: Int,
+      seed: Long): BlockMatrix = {
+    // Use floating-point division before ceil so a trailing partial block is
+    // still generated when the dimensions are not multiples of blockSize.
+    val numRowBlocks = math.ceil(m * 1.0 / blockSize).toInt
+    val numColBlocks = math.ceil(n * 1.0 / blockSize).toInt
+    val sqrtParts = math.ceil(math.sqrt(numPartitions)).toInt
+    val rowBlockIds = sc.parallelize(0 until numRowBlocks, sqrtParts)
+    val colBlockIds = sc.parallelize(0 until numColBlocks, sqrtParts)
+    val blockIds = rowBlockIds.cartesian(colBlockIds)
+    val blocks = blockIds.mapPartitionsWithIndex { (idx, ids) =>
+      val random = new Random(idx ^ seed)
+      ids.map { case (rowBlockId, colBlockId) =>
+        val mi = math.min(m - rowBlockId * blockSize, blockSize).toInt
+        val ni = math.min(n - colBlockId * blockSize, blockSize).toInt
+        ((rowBlockId, colBlockId), Matrices.randn(mi, ni, random))
+      }
+    }.cache()
+    logInfo(s"Generated ${blocks.count()} blocks.")
+    new BlockMatrix(blocks, blockSize, blockSize, m, n)
+  }
+
+  override def run(): JValue = {
+    val start = System.currentTimeMillis()
+    val C = A.multiply(B)
+    C.blocks.count()
+    val duration = (System.currentTimeMillis() - start) / 1e3
+    "time" -> duration
+  }
+}
diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataGenerator.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataGenerator.scala
new file mode 100644
index 0000000..e65a5a5
--- /dev/null
+++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataGenerator.scala
@@ -0,0 +1,586 @@
+package mllib.perf.util
+
+import org.apache.spark.ml.attribute.{AttributeGroup, NumericAttribute, NominalAttribute}
+import org.apache.spark.sql.{SQLContext, DataFrame}
+
+import scala.collection.mutable
+
+import org.apache.spark.mllib.linalg.{Vectors, Vector}
+import org.apache.spark.mllib.linalg.distributed.RowMatrix
+import org.apache.spark.mllib.random._
+import org.apache.spark.mllib.recommendation.Rating
+import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.tree.configuration.{Algo, FeatureType}
+import org.apache.spark.mllib.tree.model.{Split, DecisionTreeModel, Node, Predict}
+import org.apache.spark.rdd.{PairRDDFunctions, RDD}
+import org.apache.spark.SparkContext
+
+object DataGenerator {
+
+  def generateLabeledPoints(
+      sc: SparkContext,
+      numRows: Long,
+      numCols: Int,
+      intercept: Double,
+      labelNoise: Double,
+      numPartitions: Int,
+      seed: Long = System.currentTimeMillis(),
+      problem: String = ""): RDD[LabeledPoint] = {
+
+    RandomRDDs.randomRDD(sc, new LinearDataGenerator(numCols, intercept, seed, labelNoise, problem),
+      numRows, numPartitions, seed)
+
+  }
+
+  def generateDistributedSquareMatrix(
+      sc: SparkContext,
+      m: Long,
+      n: Int,
+      numPartitions: Int,
+      seed: Long = System.currentTimeMillis()):
RowMatrix = { + + val data: RDD[Vector] = RandomRDDs.normalVectorRDD(sc, m, n, numPartitions, seed) + + new RowMatrix(data,m,n) + } + + def generateClassificationLabeledPoints( + sc: SparkContext, + numRows: Long, + numCols: Int, + threshold: Double, + featureNoise: Double, + numPartitions: Int, + seed: Long = System.currentTimeMillis(), + chiSq: Boolean = false): RDD[LabeledPoint] = { + + RandomRDDs.randomRDD(sc, new ClassLabelGenerator(numCols,threshold, featureNoise, chiSq), + numRows, numPartitions, seed) + } + + def generateBinaryLabeledPoints( + sc: SparkContext, + numRows: Long, + numCols: Int, + threshold: Double, + numPartitions: Int, + seed: Long = System.currentTimeMillis()): RDD[LabeledPoint] = { + + RandomRDDs.randomRDD(sc, new BinaryLabeledDataGenerator(numCols,threshold), + numRows, numPartitions, seed) + } + + /** + * @param labelType 0 = regression with labels in [0,1]. Values >= 2 indicate classification. + * @param fracCategorical Fraction of columns/features to be categorical. + * @param fracBinary Fraction of categorical features to be binary. Others are high-arity (20). + * @param treeDepth Depth of "true" tree used to label points. + * @return (data, categoricalFeaturesInfo) + * data is an RDD of data points. + * categoricalFeaturesInfo is a map storing the arity of categorical features. + * E.g., an entry (n -> k) indicates that feature n is categorical + * with k categories indexed from 0: {0, 1, ..., k-1}. + */ + def generateDecisionTreeLabeledPoints( + sc: SparkContext, + numRows: Long, + numCols: Int, + numPartitions: Int, + labelType: Int, + fracCategorical: Double, + fracBinary: Double, + treeDepth: Int, + seed: Long = System.currentTimeMillis()): (RDD[LabeledPoint], Map[Int, Int]) = { + + val highArity = 20 + + require(fracCategorical >= 0 && fracCategorical <= 1, + s"fracCategorical must be in [0,1], but it is $fracCategorical") + require(fracBinary >= 0 && fracBinary <= 1, + s"fracBinary must be in [0,1], but it is $fracBinary") + + val isRegression = labelType == 0 + if (!isRegression) { + require(labelType >= 2, s"labelType must be >= 2 for classification. 0 indicates regression.") + } + val numCategorical = (numCols * fracCategorical).toInt + val numContinuous = numCols - numCategorical + val numBinary = (numCategorical * fracBinary).toInt + val numHighArity = numCategorical - numBinary + val categoricalArities = Array.concat(Array.fill(numBinary)(2), + Array.fill(numHighArity)(highArity)) + + val featuresGenerator = new FeaturesGenerator(categoricalArities, numContinuous) + val featureMatrix = RandomRDDs.randomRDD(sc, featuresGenerator, + numRows, numPartitions, seed) + + // Create random DecisionTree. + val featureArity = Array.concat(categoricalArities, Array.fill(numContinuous)(0)) + val trueModel = randomBalancedDecisionTree(treeDepth, labelType, featureArity, seed) + println(trueModel) + + // Label points using tree. + val labelVector = featureMatrix.map(trueModel.predict) + + val data = labelVector.zip(featureMatrix).map(pair => new LabeledPoint(pair._1, pair._2)) + val categoricalFeaturesInfo = featuresGenerator.getCategoricalFeaturesInfo + (data, categoricalFeaturesInfo) + } + + /** + * From spark.ml.impl.TreeTests + * + * Convert the given data to a DataFrame, and set the features and label metadata. + * @param data Dataset. Categorical features and labels must already have 0-based indices. + * This must be non-empty. 
+ * @param categoricalFeatures Map: categorical feature index -> number of distinct values + * @param numClasses Number of classes label can take. If 0, mark as continuous. + * @return DataFrame with metadata + */ + def setMetadata( + data: RDD[LabeledPoint], + categoricalFeatures: Map[Int, Int], + numClasses: Int): DataFrame = { + val sqlContext = SQLContext.getOrCreate(data.sparkContext) + import sqlContext.implicits._ + val df = data.toDF() + val numFeatures = data.first().features.size + val featuresAttributes = Range(0, numFeatures).map { feature => + if (categoricalFeatures.contains(feature)) { + NominalAttribute.defaultAttr.withIndex(feature).withNumValues(categoricalFeatures(feature)) + } else { + NumericAttribute.defaultAttr.withIndex(feature) + } + }.toArray + val featuresMetadata = new AttributeGroup("features", featuresAttributes).toMetadata() + val labelAttribute = if (numClasses == 0) { + NumericAttribute.defaultAttr.withName("label") + } else { + NominalAttribute.defaultAttr.withName("label").withNumValues(numClasses) + } + val labelMetadata = labelAttribute.toMetadata() + df.select(df("features").as("features", featuresMetadata), + df("label").as("label", labelMetadata)) + } + + + def randomBalancedDecisionTree( + depth: Int, + labelType: Int, + featureArity: Array[Int], + seed: Long = System.currentTimeMillis()): DecisionTreeModel = { + + require(depth >= 0, s"randomBalancedDecisionTree given depth < 0.") + require(depth <= featureArity.size, + s"randomBalancedDecisionTree requires depth <= featureArity.size," + + s" but depth = $depth and featureArity.size = ${featureArity.size}") + val isRegression = labelType == 0 + if (!isRegression) { + require(labelType >= 2, s"labelType must be >= 2 for classification. 0 indicates regression.") + } + + val rng = new scala.util.Random() + rng.setSeed(seed) + + val labelGenerator = if (isRegression) { + new RealLabelPairGenerator() + } else { + new ClassLabelPairGenerator(labelType) + } + + val topNode = randomBalancedDecisionTreeHelper(0, depth, featureArity, labelGenerator, + Set.empty, rng) + if (isRegression) { + new DecisionTreeModel(topNode, Algo.Regression) + } else { + new DecisionTreeModel(topNode, Algo.Classification) + } + } + + /** + * Create an internal node. Either create the leaf nodes beneath it, or recurse as needed. + * @param nodeIndex Index of node. + * @param subtreeDepth Depth of subtree to build. Depth 0 means this is a leaf node. + * @param featureArity Indicates feature type. Value 0 indicates continuous feature. + * Other values >= 2 indicate a categorical feature, + * where the value is the number of categories. + * @param usedFeatures Features appearing in the path from the tree root to the node + * being constructed. + * @param labelGenerator Generates pairs of distinct labels. + * @return + */ + def randomBalancedDecisionTreeHelper( + nodeIndex: Int, + subtreeDepth: Int, + featureArity: Array[Int], + labelGenerator: RandomDataGenerator[Pair[Double, Double]], + usedFeatures: Set[Int], + rng: scala.util.Random): Node = { + + if (subtreeDepth == 0) { + // This case only happens for a depth 0 tree. + return new Node(id = nodeIndex, predict = new Predict(0), impurity = 0, isLeaf = true, + split = None, leftNode = None, rightNode = None, stats = None) + } + + val numFeatures = featureArity.size + if (usedFeatures.size >= numFeatures) { + // Should not happen. + throw new RuntimeException(s"randomBalancedDecisionTreeSplitNode ran out of " + + s"features for splits.") + } + + // Make node internal. 
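+    // Draw feature indices until we find one unused on the path from the root; the
+    // require(depth <= featureArity.size) check in randomBalancedDecisionTree
+    // guarantees an unused feature remains, so this loop terminates.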
+ var feature: Int = rng.nextInt(numFeatures) + while (usedFeatures.contains(feature)) { + feature = rng.nextInt(numFeatures) + } + val split: Split = if (featureArity(feature) == 0) { + // continuous feature + new Split(feature = feature, threshold = rng.nextDouble(), + featureType = FeatureType.Continuous, categories = List()) + } else { + // categorical feature + // Put nCatsSplit categories on left, and the rest on the right. + // nCatsSplit is in {1,...,arity-1}. + val nCatsSplit = rng.nextInt(featureArity(feature) - 1) + 1 + val splitCategories = rng.shuffle(Range(0,featureArity(feature)).toList).take(nCatsSplit) + new Split(feature = feature, threshold = 0, + featureType = FeatureType.Categorical, categories = + splitCategories.asInstanceOf[List[Double]]) + } + + val leftChildIndex = nodeIndex * 2 + 1 + val rightChildIndex = nodeIndex * 2 + 2 + if (subtreeDepth == 1) { + // Add leaf nodes. + val predictions = labelGenerator.nextValue() + new Node(id = nodeIndex, predict = new Predict(0), impurity = 0, isLeaf = false, split = Some(split), + leftNode = Some(new Node(id = leftChildIndex, predict = new Predict(predictions._1), impurity = 0, isLeaf = true, + split = None, leftNode = None, rightNode = None, stats = None)), + rightNode = Some(new Node(id = rightChildIndex, predict = new Predict(predictions._2), impurity = 0, isLeaf = true, + split = None, leftNode = None, rightNode = None, stats = None)), stats = None) + } else { + new Node(id = nodeIndex, predict = new Predict(0), impurity = 0, isLeaf = false, split = Some(split), + leftNode = Some(randomBalancedDecisionTreeHelper(leftChildIndex, subtreeDepth - 1, + featureArity, labelGenerator, usedFeatures + feature, rng)), + rightNode = Some(randomBalancedDecisionTreeHelper(rightChildIndex, subtreeDepth - 1, + featureArity, labelGenerator, usedFeatures + feature, rng)), stats = None) + } + } + + def generateKMeansVectors( + sc: SparkContext, + numRows: Long, + numCols: Int, + numCenters: Int, + numPartitions: Int, + seed: Long = System.currentTimeMillis()): RDD[Vector] = { + + RandomRDDs.randomRDD(sc, new KMeansDataGenerator(numCenters, numCols, seed), + numRows, numPartitions, seed) + } + + + // Problems with having a userID or productID in the test set but not training set + // leads to a lot of work... 
+ def generateRatings( + sc: SparkContext, + numUsers: Int, + numProducts: Int, + numRatings: Long, + implicitPrefs: Boolean, + numPartitions: Int, + seed: Long = System.currentTimeMillis()): (RDD[Rating],RDD[Rating]) = { + + val train = RandomRDDs.randomRDD(sc, + new RatingGenerator(numUsers, numProducts,implicitPrefs), + numRatings, numPartitions, seed).cache() + + val test = RandomRDDs.randomRDD(sc, + new RatingGenerator(numUsers, numProducts,implicitPrefs), + math.ceil(numRatings * 0.25).toLong, numPartitions, seed + 24) + + // Now get rid of duplicate ratings and remove non-existant userID's + // and prodID's from the test set + val commons: PairRDDFunctions[(Int,Int),Rating] = + new PairRDDFunctions(train.keyBy(rating => (rating.user, rating.product)).cache()) + + val exact = commons.join(test.keyBy(rating => (rating.user, rating.product))) + + val trainPruned = commons.subtractByKey(exact).map(_._2).cache() + + // Now get rid of users that don't exist in the train set + val trainUsers: RDD[(Int,Rating)] = trainPruned.keyBy(rating => rating.user) + val testUsers: PairRDDFunctions[Int,Rating] = + new PairRDDFunctions(test.keyBy(rating => rating.user)) + val testWithAdditionalUsers = testUsers.subtractByKey(trainUsers) + + val userPrunedTestProds: RDD[(Int,Rating)] = + testUsers.subtractByKey(testWithAdditionalUsers).map(_._2).keyBy(rating => rating.product) + + val trainProds: RDD[(Int,Rating)] = trainPruned.keyBy(rating => rating.product) + + val testWithAdditionalProds = + new PairRDDFunctions[Int, Rating](userPrunedTestProds).subtractByKey(trainProds) + val finalTest = + new PairRDDFunctions[Int, Rating](userPrunedTestProds).subtractByKey(testWithAdditionalProds) + .map(_._2) + + (trainPruned, finalTest) + } + +} + +class RatingGenerator( + private val numUsers: Int, + private val numProducts: Int, + private val implicitPrefs: Boolean) extends RandomDataGenerator[Rating] { + + private val rng = new java.util.Random() + + private val observed = new mutable.HashMap[(Int, Int), Boolean]() + + override def nextValue(): Rating = { + var tuple = (rng.nextInt(numUsers),rng.nextInt(numProducts)) + while (observed.getOrElse(tuple,false)){ + tuple = (rng.nextInt(numUsers),rng.nextInt(numProducts)) + } + observed += (tuple -> true) + + val rating = if (implicitPrefs) rng.nextInt(2)*1.0 else rng.nextDouble()*5 + + new Rating(tuple._1, tuple._2, rating) + } + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): RatingGenerator = new RatingGenerator(numUsers, numProducts, implicitPrefs) +} + +// For general classification +class ClassLabelGenerator( + private val numFeatures: Int, + private val threshold: Double, + private val featureNoise: Double, + private val chiSq: Boolean) extends RandomDataGenerator[LabeledPoint] { + + private val rng = new java.util.Random() + + override def nextValue(): LabeledPoint = { + val y = if (rng.nextDouble() < threshold) 0.0 else 1.0 + val x = Array.fill[Double](numFeatures) { + if (!chiSq) rng.nextGaussian() + (y * featureNoise) else rng.nextInt(6) * 1.0 + } + + LabeledPoint(y, Vectors.dense(x)) + } + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): ClassLabelGenerator = + new ClassLabelGenerator(numFeatures, threshold, featureNoise, chiSq) +} + +class BinaryLabeledDataGenerator( + private val numFeatures: Int, + private val threshold: Double) extends RandomDataGenerator[LabeledPoint] { + + private val rng = new java.util.Random() + + override def nextValue(): LabeledPoint = { + val y = if 
(rng.nextDouble() < threshold) 0.0 else 1.0 + val x = Array.fill[Double](numFeatures) { + if (rng.nextDouble() < threshold) 0.0 else 1.0 + } + LabeledPoint(y, Vectors.dense(x)) + } + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): BinaryLabeledDataGenerator = + new BinaryLabeledDataGenerator(numFeatures, threshold) + +} + +class LinearDataGenerator( + val numFeatures: Int, + val intercept: Double, + val seed: Long, + val labelNoise: Double, + val problem: String = "", + val sparsity: Double = 1.0) extends RandomDataGenerator[LabeledPoint] { + + private val rng = new java.util.Random(seed) + + private val weights = Array.fill(numFeatures)(rng.nextDouble()) + private val nnz: Int = math.ceil(numFeatures*sparsity).toInt + + override def nextValue(): LabeledPoint = { + val x = Array.fill[Double](nnz)(2*rng.nextDouble()-1) + + val y = weights.zip(x).map(p => p._1 * p._2).sum + intercept + labelNoise*rng.nextGaussian() + val yD = + if (problem == "SVM"){ + if (y < 0.0) 0.0 else 1.0 + } else{ + y + } + + LabeledPoint(yD, Vectors.dense(x)) + } + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): LinearDataGenerator = + new LinearDataGenerator(numFeatures, intercept, seed, labelNoise, problem, sparsity) +} + + +/** + * Generator for a pair of distinct class labels from the set {0,...,numClasses-1}. + * @param numClasses Number of classes. + */ +class ClassLabelPairGenerator(val numClasses: Int) + extends RandomDataGenerator[Pair[Double, Double]] { + + require(numClasses >= 2, + s"ClassLabelPairGenerator given label numClasses = $numClasses, but numClasses should be >= 2.") + + private val rng = new java.util.Random() + + override def nextValue(): Pair[Double, Double] = { + val left = rng.nextInt(numClasses) + var right = rng.nextInt(numClasses) + while (right == left) { + right = rng.nextInt(numClasses) + } + new Pair[Double, Double](left, right) + } + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): ClassLabelPairGenerator = new ClassLabelPairGenerator(numClasses) +} + + +/** + * Generator for a pair of real-valued labels. + */ +class RealLabelPairGenerator() extends RandomDataGenerator[Pair[Double, Double]] { + + private val rng = new java.util.Random() + + override def nextValue(): Pair[Double, Double] = + new Pair[Double, Double](rng.nextDouble(), rng.nextDouble()) + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): RealLabelPairGenerator = new RealLabelPairGenerator() +} + + +/** + * Generator for a feature vector which can include a mix of categorical and continuous features. + * @param categoricalArities Specifies the number of categories for each categorical feature. + * @param numContinuous Number of continuous features. Feature values are in range [0,1]. + */ +class FeaturesGenerator(val categoricalArities: Array[Int], val numContinuous: Int) + extends RandomDataGenerator[Vector] { + + categoricalArities.foreach { arity => + require(arity >= 2, s"FeaturesGenerator given categorical arity = $arity, " + + s"but arity should be >= 2.") + } + + val numFeatures = categoricalArities.size + numContinuous + + private val rng = new java.util.Random() + + /** + * Generates vector with categorical features first, and continuous features in [0,1] second. + */ + override def nextValue(): Vector = { + // Feature ordering matches getCategoricalFeaturesInfo. 
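+    // Illustrative example (not a real draw): with categoricalArities = Array(2, 20)
+    // and numContinuous = 1, a generated vector might be [1.0, 17.0, 0.42]:
+    // category indices first, then uniform draws from [0, 1].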
+ val arr = new Array[Double](numFeatures) + var j = 0 + while (j < categoricalArities.size) { + arr(j) = rng.nextInt(categoricalArities(j)) + j += 1 + } + while (j < numFeatures) { + arr(j) = rng.nextDouble() + j += 1 + } + Vectors.dense(arr) + } + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): FeaturesGenerator = new FeaturesGenerator(categoricalArities, numContinuous) + + /** + * @return categoricalFeaturesInfo Map storing arity of categorical features. + * E.g., an entry (n -> k) indicates that feature n is categorical + * with k categories indexed from 0: {0, 1, ..., k-1}. + */ + def getCategoricalFeaturesInfo: Map[Int, Int] = { + // Categorical features are indexed from 0 because of the implementation of nextValue(). + categoricalArities.zipWithIndex.map(_.swap).toMap + } + +} + + +class KMeansDataGenerator( + val numCenters: Int, + val numColumns: Int, + val seed: Long) extends RandomDataGenerator[Vector] { + + private val rng = new java.util.Random(seed) + private val rng2 = new java.util.Random(seed + 24) + private val scale_factors = Array.fill(numCenters)(rng.nextInt(20) - 10) + + // Have a random number of points around a cluster + private val concentrations: Seq[Double] = { + val rand = Array.fill(numCenters)(rng.nextDouble()) + val randSum = rand.sum + val scaled = rand.map(x => x / randSum) + + (1 to numCenters).map{i => + scaled.slice(0, i).sum + } + } + + private val centers = (0 until numCenters).map{i => + Array.fill(numColumns)((2 * rng.nextDouble() - 1)*scale_factors(i)) + } + + override def nextValue(): Vector = { + val pick_center_rand = rng2.nextDouble() + + val centerToAddTo = centers(concentrations.indexWhere(p => pick_center_rand <= p)) + + Vectors.dense(Array.tabulate(numColumns)(i => centerToAddTo(i) + rng2.nextGaussian())) + } + + override def setSeed(seed: Long) { + rng.setSeed(seed) + } + + override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numColumns, seed) +} diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataLoader.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataLoader.scala new file mode 100644 index 0000000..f0bd48c --- /dev/null +++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataLoader.scala @@ -0,0 +1,143 @@ +package mllib.perf.util + +import org.apache.spark.SparkContext +import org.apache.spark.mllib.regression._ +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.rdd.RDD + +object DataLoader { + + // For DecisionTreeTest: PartitionLabelStats tracks the stats for each partition. 
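+  // Each partition is summarized as (min, max, distinct-label count, whether any
+  // non-integer label was seen); labelCombOp below merges the per-partition
+  // summaries, and isClassification treats labels as class indices only when all
+  // labels are integers and the distinct count stays within MAX_CATEGORIES.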
+  class PartitionLabelStats(
+      var min: Double,
+      var max: Double,
+      var distinct: Long,
+      var nonInteger: Boolean)
+    extends Serializable
+
+  object PartitionLabelStats extends Serializable {
+    /** Max categories allowed for categorical label (for inferring labelType) */
+    val MAX_CATEGORIES = 1000
+
+    def labelSeqOp(lps: Iterator[LabeledPoint]): Iterator[PartitionLabelStats] = {
+      val stats = new PartitionLabelStats(Double.MaxValue, Double.MinValue, 0, false)
+      val labelSet = new scala.collection.mutable.HashSet[Double]()
+      lps.foreach { lp =>
+        if (lp.label.toInt != lp.label) {
+          stats.nonInteger = true
+        }
+        stats.min = Math.min(lp.label, stats.min)
+        stats.max = Math.max(lp.label, stats.max)
+        if (labelSet.size <= MAX_CATEGORIES) {
+          labelSet.add(lp.label)
+        }
+      }
+      stats.distinct = labelSet.size
+      Iterator(stats)
+    }
+
+    def labelCombOp(
+        labelStatsA: PartitionLabelStats,
+        labelStatsB: PartitionLabelStats): PartitionLabelStats = {
+      labelStatsA.min = Math.min(labelStatsA.min, labelStatsB.min)
+      labelStatsA.max = Math.max(labelStatsA.max, labelStatsB.max)
+      labelStatsA.distinct = Math.max(labelStatsA.distinct, labelStatsB.distinct)
+      labelStatsA
+    }
+  }
+
+  /** Infer label type from data */
+  private def isClassification(data: RDD[LabeledPoint]): Boolean = {
+    val labelStats =
+      data.mapPartitions(PartitionLabelStats.labelSeqOp)
+        .fold(new PartitionLabelStats(Double.MaxValue, Double.MinValue, 0, false))(
+          PartitionLabelStats.labelCombOp)
+    labelStats.distinct <= PartitionLabelStats.MAX_CATEGORIES && !labelStats.nonInteger
+  }
+
+  /**
+   * Load training and test LibSVM-format data files.
+   * @return (trainTestDatasets, categoricalFeaturesInfo, numClasses) where
+   *         trainTestDatasets = Array(trainingData, testData),
+   *         categoricalFeaturesInfo is a map of categorical feature arities, and
+   *         numClasses = number of classes label can take.
+   */
+  private[perf] def loadLibSVMFiles(
+      sc: SparkContext,
+      numPartitions: Int,
+      trainingDataPath: String,
+      testDataPath: String,
+      testDataFraction: Double,
+      seed: Long): (Array[RDD[LabeledPoint]], Map[Int, Int], Int) = {
+
+    val trainingData = MLUtils.loadLibSVMFile(sc, trainingDataPath, -1, numPartitions)
+
+    val (rdds, categoricalFeaturesInfo_) = if (testDataPath == "") {
+      // randomly split trainingData into train, test
+      val splits = trainingData.randomSplit(Array(1.0 - testDataFraction, testDataFraction), seed)
+      (splits, Map.empty[Int, Int])
+    } else {
+      // load test data
+      val numFeatures = trainingData.take(1)(0).features.size
+      val testData = MLUtils.loadLibSVMFile(sc, testDataPath, numFeatures, numPartitions)
+      (Array(trainingData, testData), Map.empty[Int, Int])
+    }
+
+    // For classification, re-index classes if needed.
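+    // (MLlib classifiers expect labels 0,...,numClasses-1, so unless the loaded
+    // labels are exactly {0.0, 1.0} already, each label is remapped to its rank
+    // among the sorted distinct labels.)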
+ val (finalDatasets, classIndexMap, numClasses) = { + if (isClassification(rdds(0)) && isClassification(rdds(1))) { + // classCounts: class --> # examples in class + val classCounts: Map[Double, Long] = { + val trainClassCounts = rdds(0).map(_.label).countByValue() + val testClassCounts = rdds(1).map(_.label).countByValue() + val mutableClassCounts = new scala.collection.mutable.HashMap[Double, Long]() + trainClassCounts.foreach { case (label, cnt) => + mutableClassCounts(label) = mutableClassCounts.getOrElseUpdate(label, 0) + cnt + } + testClassCounts.foreach { case (label, cnt) => + mutableClassCounts(label) = mutableClassCounts.getOrElseUpdate(label, 0) + cnt + } + mutableClassCounts.toMap + } + val sortedClasses = classCounts.keys.toList.sorted + val numClasses = classCounts.size + // classIndexMap: class --> index in 0,...,numClasses-1 + val classIndexMap = { + if (classCounts.keySet != Set(0.0, 1.0)) { + sortedClasses.zipWithIndex.toMap + } else { + Map[Double, Int]() + } + } + val indexedRdds = { + if (classIndexMap.isEmpty) { + rdds + } else { + rdds.map { rdd => + rdd.map(lp => LabeledPoint(classIndexMap(lp.label), lp.features)) + } + } + } + val numTrain = indexedRdds(0).count() + val numTest = indexedRdds(1).count() + val numTotalInstances = numTrain + numTest + println(s"numTrain: $numTrain") + println(s"numTest: $numTest") + println(s"numClasses: $numClasses") + println(s"Per-class example fractions, counts:") + println(s"Class\tFrac\tCount") + sortedClasses.foreach { c => + val frac = classCounts(c) / numTotalInstances.toDouble + println(s"$c\t$frac\t${classCounts(c)}") + } + (indexedRdds, classIndexMap, numClasses) + } else { + (rdds, null, 0) + } + } + + (finalDatasets, categoricalFeaturesInfo_, numClasses) + } + +} From b1d19475996ba279c24e6c11e5d8e166787d6df4 Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Thu, 30 Jun 2016 16:58:47 +0100 Subject: [PATCH 05/22] Add config file template back in --- config/config.py.template | 797 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 797 insertions(+) create mode 100755 config/config.py.template diff --git a/config/config.py.template b/config/config.py.template new file mode 100755 index 0000000..019cbac --- /dev/null +++ b/config/config.py.template @@ -0,0 +1,797 @@ +""" +Configuration options for running Spark performance tests. + +When updating `spark-perf`, you should probably use `diff` to compare the updated template to +your modified `config.py` file and copy over any new configurations. +""" + +import time +import os +import os.path +import socket + +from sparkperf.config_utils import FlagSet, JavaOptionSet, OptionSet, ConstantOption + + +# ================================ # +# Standard Configuration Options # +# ================================ # + +DEFAULT_HOME=os.environ['HOME'] + +SPARK_HOME_DIR = os.getenv('SPARK_HOME', DEFAULT_HOME) + +# Use a custom configuration directory +SPARK_CONF_DIR = SPARK_HOME_DIR + "/conf" + +# Master used when submitting Spark jobs. +# For local clusters: "spark://%s:7077" % socket.gethostname() +# For Yarn clusters: "yarn" +# Otherwise, the default uses the specified EC2 cluster + +SPARK_CLUSTER_URL = "spark://%s:7077" % socket.gethostname() +IS_YARN_MODE = "yarn" in SPARK_CLUSTER_URL +IS_MESOS_MODE = "mesos" in SPARK_CLUSTER_URL + +# Specify URI to download spark executor. This only applied for running with Mesos. +#SPARK_EXECUTOR_URI = "http://localhost:8000/spark.tgz" + +# Path to the Mesos native library. This is only required for running with Mesos. 
+#MESOS_NATIVE_LIBRARY = "/usr/local/lib/libmesos.so" + +# Run Mesos client in coarse or fine grain mode. This is only applied for running with Mesos. +#SPARK_MESOS_COARSE = True + + +# If this is true, we'll submit your job using an existing Spark installation. +# If this is false, we'll clone and build a specific version of Spark, and +# copy configurations from your existing Spark installation. +USE_CLUSTER_SPARK = True + +# URL of the HDFS installation in the Spark EC2 cluster +HDFS_URL = "hdfs://%s:9000/test/" % socket.gethostname() + +# Set the following if not using existing Spark installation +# Commit id and repo used if you are not using an existing Spark cluster +# custom version of Spark. The remote name in your git repo is assumed +# to be "origin". +# +# The commit ID can specify any of the following: +# 1. A git commit hash e.g. "4af93ff3" +# 2. A branch name e.g. "origin/branch-0.7" +# 3. A tag name e.g. "origin/tag/v0.8.0-incubating" +# 4. A pull request e.g. "origin/pr/675" +SPARK_COMMIT_ID = "" +SPARK_GIT_REPO = "https://github.com/apache/spark.git" +SPARK_MERGE_COMMIT_INTO_MASTER = False # Whether to merge the commit into master + +# Whether to install and build Spark. Set this to true only for the +# first installation if an existing one does not already exist. +PREP_SPARK = not USE_CLUSTER_SPARK + +# Whether to restart the Master and all Workers +# This should always be false for Yarn +RESTART_SPARK_CLUSTER = True +RESTART_SPARK_CLUSTER = RESTART_SPARK_CLUSTER and not IS_YARN_MODE + +# Rsync SPARK_HOME to all the slaves or not +RSYNC_SPARK_HOME = True + +# Which tests to run +RUN_SPARK_TESTS = False +RUN_PYSPARK_TESTS = False +RUN_STREAMING_TESTS = False +RUN_MLLIB_TESTS = True +RUN_PYTHON_MLLIB_TESTS = True + +# Which tests to prepare. Set this to true for the first +# installation or whenever you make a change to the tests. +PREP_SPARK_TESTS = True +PREP_PYSPARK_TESTS = True +PREP_STREAMING_TESTS = True +PREP_MLLIB_TESTS = True + +# Whether to warm up local disks (warm-up is only necesary on EC2). +DISK_WARMUP = False + +# Total number of bytes used to warm up each local directory. +DISK_WARMUP_BYTES = 200 * 1024 * 1024 + +# Number of files to create when warming up each local directory. +# Bytes will be evenly divided across files. +DISK_WARMUP_FILES = 200 + +# Prompt for confirmation when deleting temporary files. +PROMPT_FOR_DELETES = False + +# Files to write results to +SPARK_OUTPUT_FILENAME = "results/spark_perf_output_%s_%s" % ( + SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) +PYSPARK_OUTPUT_FILENAME = "results/python_perf_output_%s_%s" % ( + SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) +STREAMING_OUTPUT_FILENAME = "results/streaming_perf_output_%s_%s" % ( + SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) +MLLIB_OUTPUT_FILENAME = "results/mllib_perf_output_%s_%s" % ( + SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) +PYTHON_MLLIB_OUTPUT_FILENAME = "results/python_mllib_perf_output_%s_%s" % ( + SPARK_COMMIT_ID.replace("/", "-"), time.strftime("%Y-%m-%d_%H-%M-%S")) + + +# ============================ # +# Test Configuration Options # +# ============================ # + +# The default values configured below are appropriate for approximately 20 m1.xlarge nodes, +# in which each node has 15 GB of memory. Use this variable to scale the values (e.g. +# number of records in a generated dataset) if you are running the tests with more +# or fewer nodes. 
When developing new test suites, you might want to set this to a small +# value suitable for a single machine, such as 0.001. +SCALE_FACTOR = 0.01 + +assert SCALE_FACTOR > 0, "SCALE_FACTOR must be > 0." + +# If set, removes the first N trials for each test from all reported statistics. Useful for +# tests which have outlier behavior due to JIT and other system cache warm-ups. If any test +# returns fewer N + 1 results, an exception is thrown. +IGNORED_TRIALS = 2 + +# Command used to launch Scala or Java. + +# Set up OptionSets. Note that giant cross product is done over all JavaOptionsSets + OptionSets +# passed to each test which may be combinations of those set up here. + +# Java options. +COMMON_JAVA_OPTS = [ + # Fraction of JVM memory used for caching RDDs. + JavaOptionSet("spark.storage.memoryFraction", [0.66]), + JavaOptionSet("spark.serializer", ["org.apache.spark.serializer.JavaSerializer"]), + JavaOptionSet("spark.executor.memory", ["2g"]), + # Turn event logging on in order better diagnose failed tests. Off by default as it crashes + # releases prior to 1.0.2 + # JavaOptionSet("spark.eventLog.enabled", [True]), + # To ensure consistency across runs, we disable delay scheduling + JavaOptionSet("spark.locality.wait", [str(60 * 1000 * 1000)]) +] +# Set driver memory here +SPARK_DRIVER_MEMORY = "2g" +# The following options value sets are shared among all tests. +COMMON_OPTS = [ + # How many times to run each experiment - used to warm up system caches. + # This OptionSet should probably only have a single value (i.e., length 1) + # since it doesn't make sense to have multiple values here. + OptionSet("num-trials", [10]), + # Extra pause added between trials, in seconds. For runs with large amounts + # of shuffle data, this gives time for buffer cache write-back. + OptionSet("inter-trial-wait", [3]) +] + +# The following options value sets are shared among all tests of +# operations on key-value data. +SPARK_KEY_VAL_TEST_OPTS = [ + # The number of input partitions. + OptionSet("num-partitions", [400], can_scale=True), + # The number of reduce tasks. + OptionSet("reduce-tasks", [400], can_scale=True), + # A random seed to make tests reproducable. + OptionSet("random-seed", [5]), + # Input persistence strategy (can be "memory", "disk", or "hdfs"). + # NOTE: If "hdfs" is selected, datasets will be re-used across runs of + # this script. This means parameters here are effectively ignored if + # an existing input dataset is present. + OptionSet("persistent-type", ["memory"]), + # Whether to wait for input in order to exit the JVM. + FlagSet("wait-for-exit", [False]), + # Total number of records to create. + OptionSet("num-records", [200 * 1000 * 1000], True), + # Number of unique keys to sample from. + OptionSet("unique-keys",[20 * 1000], True), + # Length in characters of each key. + OptionSet("key-length", [10]), + # Number of unique values to sample from. + OptionSet("unique-values", [1000 * 1000], True), + # Length in characters of each value. + OptionSet("value-length", [10]), + # Use hashes instead of padded numbers for keys and values + FlagSet("hash-records", [False]), + # Storage location if HDFS persistence is used + OptionSet("storage-location", [ + HDFS_URL + "/spark-perf-kv-data"]) +] + + +# ======================= # +# Spark Core Test Setup # +# ======================= # + +# Set up the actual tests. 
Each test is represented by a tuple:
+# (short_name, test_cmd, scale_factor, list of JavaOptionSets, list of OptionSets)
+
+SPARK_KV_OPTS = COMMON_OPTS + SPARK_KEY_VAL_TEST_OPTS
+SPARK_TESTS = []
+
+SCHEDULING_THROUGHPUT_OPTS = [
+    # The number of tasks that should be launched in each job:
+    OptionSet("num-tasks", [10 * 1000]),
+    # The number of jobs that should be run:
+    OptionSet("num-jobs", [1]),
+    # The size of the task closure (in bytes):
+    OptionSet("closure-size", [0]),
+    # A random seed to make tests reproducible:
+    OptionSet("random-seed", [5]),
+]
+
+SPARK_TESTS += [("scheduling-throughput", "spark.perf.TestRunner",
+    SCALE_FACTOR, COMMON_JAVA_OPTS,
+    [ConstantOption("scheduling-throughput")] + COMMON_OPTS + SCHEDULING_THROUGHPUT_OPTS)]
+
+SPARK_TESTS += [("scala-agg-by-key", "spark.perf.TestRunner", SCALE_FACTOR,
+    COMMON_JAVA_OPTS, [ConstantOption("aggregate-by-key")] + SPARK_KV_OPTS)]
+
+# Scale the input for this test by 2x since ints are smaller.
+SPARK_TESTS += [("scala-agg-by-key-int", "spark.perf.TestRunner", SCALE_FACTOR * 2,
+    COMMON_JAVA_OPTS, [ConstantOption("aggregate-by-key-int")] + SPARK_KV_OPTS)]
+
+SPARK_TESTS += [("scala-agg-by-key-naive", "spark.perf.TestRunner", SCALE_FACTOR,
+    COMMON_JAVA_OPTS, [ConstantOption("aggregate-by-key-naive")] + SPARK_KV_OPTS)]
+
+# Scale the input for this test by 0.10.
+SPARK_TESTS += [("scala-sort-by-key", "spark.perf.TestRunner", SCALE_FACTOR * 0.1,
+    COMMON_JAVA_OPTS, [ConstantOption("sort-by-key")] + SPARK_KV_OPTS)]
+
+SPARK_TESTS += [("scala-sort-by-key-int", "spark.perf.TestRunner", SCALE_FACTOR * 0.2,
+    COMMON_JAVA_OPTS, [ConstantOption("sort-by-key-int")] + SPARK_KV_OPTS)]
+
+SPARK_TESTS += [("scala-count", "spark.perf.TestRunner", SCALE_FACTOR,
+    COMMON_JAVA_OPTS, [ConstantOption("count")] + SPARK_KV_OPTS)]
+
+SPARK_TESTS += [("scala-count-w-fltr", "spark.perf.TestRunner", SCALE_FACTOR,
+    COMMON_JAVA_OPTS, [ConstantOption("count-with-filter")] + SPARK_KV_OPTS)]
+
+
+# ==================== #
+#  Pyspark Test Setup  #
+# ==================== #
+
+PYSPARK_TESTS = []
+
+BROADCAST_TEST_OPTS = [
+    # The size of broadcast
+    OptionSet("broadcast-size", [200 << 20], can_scale=True),
+]
+
+PYSPARK_TESTS += [("python-scheduling-throughput", "core_tests.py",
+    SCALE_FACTOR, COMMON_JAVA_OPTS,
+    [ConstantOption("SchedulerThroughputTest"), OptionSet("num-tasks", [5000])] + COMMON_OPTS)]
+
+PYSPARK_TESTS += [("python-agg-by-key", "core_tests.py", SCALE_FACTOR,
+    COMMON_JAVA_OPTS, [ConstantOption("AggregateByKey")] + SPARK_KV_OPTS)]
+
+# Scale the input for this test by 2x since ints are smaller.
+PYSPARK_TESTS += [("python-agg-by-key-int", "core_tests.py", SCALE_FACTOR * 2,
+    COMMON_JAVA_OPTS, [ConstantOption("AggregateByKeyInt")] + SPARK_KV_OPTS)]
+
+PYSPARK_TESTS += [("python-agg-by-key-naive", "core_tests.py", SCALE_FACTOR,
+    COMMON_JAVA_OPTS, [ConstantOption("AggregateByKeyNaive")] + SPARK_KV_OPTS)]
+
+# Scale the input for this test by 0.10.
+PYSPARK_TESTS += [("python-sort-by-key", "core_tests.py", SCALE_FACTOR * 0.1, + COMMON_JAVA_OPTS, [ConstantOption("SortByKey")] + SPARK_KV_OPTS)] + +PYSPARK_TESTS += [("python-sort-by-key-int", "core_tests.py", SCALE_FACTOR * 0.2, + COMMON_JAVA_OPTS, [ConstantOption("SortByKeyInt")] + SPARK_KV_OPTS)] + +PYSPARK_TESTS += [("python-count", "core_tests.py", SCALE_FACTOR, + COMMON_JAVA_OPTS, [ConstantOption("Count")] + SPARK_KV_OPTS)] + +PYSPARK_TESTS += [("python-count-w-fltr", "core_tests.py", SCALE_FACTOR, + COMMON_JAVA_OPTS, [ConstantOption("CountWithFilter")] + SPARK_KV_OPTS)] + +PYSPARK_TESTS += [("python-broadcast-w-bytes", "core_tests.py", SCALE_FACTOR, + COMMON_JAVA_OPTS, [ConstantOption("BroadcastWithBytes")] + SPARK_KV_OPTS + BROADCAST_TEST_OPTS)] + +PYSPARK_TESTS += [("python-broadcast-w-set", "core_tests.py", SCALE_FACTOR, + COMMON_JAVA_OPTS, [ConstantOption("BroadcastWithSet")] + SPARK_KV_OPTS + BROADCAST_TEST_OPTS)] + + +# ============================ # +# Spark Streaming Test Setup # +# ============================ # + +STREAMING_TESTS = [] + +# The following function generates options for setting batch duration in streaming tests +def streaming_batch_duration_opts(duration): + return [OptionSet("batch-duration", [duration])] + +# The following function generates options for setting window duration in streaming tests +def streaming_window_duration_opts(duration): + return [OptionSet("window-duration", [duration])] + +STREAMING_COMMON_OPTS = [ + OptionSet("total-duration", [60]), + OptionSet("hdfs-url", [HDFS_URL]), +] + +STREAMING_COMMON_JAVA_OPTS = [ + # Fraction of JVM memory used for caching RDDs. + JavaOptionSet("spark.storage.memoryFraction", [0.66]), + JavaOptionSet("spark.serializer", ["org.apache.spark.serializer.JavaSerializer"]), + # JavaOptionSet("spark.executor.memory", ["2g"]) +] + +STREAMING_KEY_VAL_TEST_OPTS = STREAMING_COMMON_OPTS + streaming_batch_duration_opts(2000) + [ + # Number of input streams. + OptionSet("num-streams", [1], can_scale=True), + # Number of records per second per input stream + OptionSet("records-per-sec", [10 * 1000]), + # Number of reduce tasks. + OptionSet("reduce-tasks", [10], can_scale=True), + # memory serialization ("true" or "false"). + OptionSet("memory-serialization", ["true"]), + # Number of unique keys to sample from. + OptionSet("unique-keys",[100 * 1000], can_scale=True), + # Length in characters of each key. 
+ OptionSet("unique-values", [1000 * 1000], can_scale=True), + # Send data through receiver + OptionSet("use-receiver", ["true"]), +] + +STREAMING_HDFS_RECOVERY_TEST_OPTS = STREAMING_COMMON_OPTS + streaming_batch_duration_opts(5000) + [ + OptionSet("records-per-file", [10000]), + OptionSet("file-cleaner-delay", [300]) +] + +# This test is just to see if everything is setup properly +STREAMING_TESTS += [("basic", "streaming.perf.TestRunner", SCALE_FACTOR, + STREAMING_COMMON_JAVA_OPTS, [ConstantOption("basic")] + STREAMING_COMMON_OPTS + streaming_batch_duration_opts(1000))] + +STREAMING_TESTS += [("state-by-key", "streaming.perf.TestRunner", SCALE_FACTOR, + STREAMING_COMMON_JAVA_OPTS, [ConstantOption("state-by-key")] + STREAMING_KEY_VAL_TEST_OPTS)] + +STREAMING_TESTS += [("group-by-key-and-window", "streaming.perf.TestRunner", SCALE_FACTOR, + STREAMING_COMMON_JAVA_OPTS, [ConstantOption("group-by-key-and-window")] + STREAMING_KEY_VAL_TEST_OPTS + streaming_window_duration_opts(10000) )] + +STREAMING_TESTS += [("reduce-by-key-and-window", "streaming.perf.TestRunner", SCALE_FACTOR, + STREAMING_COMMON_JAVA_OPTS, [ConstantOption("reduce-by-key-and-window")] + STREAMING_KEY_VAL_TEST_OPTS + streaming_window_duration_opts(10000) )] + +STREAMING_TESTS += [("hdfs-recovery", "streaming.perf.TestRunner", SCALE_FACTOR, + STREAMING_COMMON_JAVA_OPTS, [ConstantOption("hdfs-recovery")] + STREAMING_HDFS_RECOVERY_TEST_OPTS)] + + +# ================== # +# MLlib Test Setup # +# ================== # + +MLLIB_TESTS = [] +MLLIB_PERF_TEST_RUNNER = "mllib.perf.TestRunner" + +# Set this to 1.0, 1.1, 1.2, ... (the major version) to test MLlib with a particular Spark version. +# Note: You should also build mllib-perf using -Dspark.version to specify the same version. +# Note: To run perf tests against a snapshot version of Spark which has not yet been packaged into a release: +# * Build Spark locally by running `build/sbt assembly; build/sbt publishLocal` in the Spark root directory +# * Set `USE_CLUSTER_SPARK = True` and `MLLIB_SPARK_VERSION = {desired Spark version, e.g. 1.5}` +# * Don't use PREP_MLLIB_TESTS = True; instead manually run `cd mllib-tests; sbt/sbt -Dspark.version=1.5.0-SNAPSHOT clean assembly` to build perf tests + +MLLIB_SPARK_VERSION = "2.0" + +MLLIB_JAVA_OPTS = COMMON_JAVA_OPTS +if MLLIB_SPARK_VERSION >= 1.1: + MLLIB_JAVA_OPTS = MLLIB_JAVA_OPTS + [ + # Shuffle manager: SORT, HASH + JavaOptionSet("spark.shuffle.manager", ["SORT"]) + ] + +# The following options value sets are shared among all tests of +# operations on MLlib algorithms. +MLLIB_COMMON_OPTS = COMMON_OPTS + [ + # The number of input partitions. + # The default setting is suitable for a 16-node m3.2xlarge EC2 cluster. + OptionSet("num-partitions", [128], can_scale=True), + # A random seed to make tests reproducable. 
+ OptionSet("random-seed", [5]) +] + +# Algorithms available in Spark-1.0 # + +# Regression and Classification Tests # +MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS = MLLIB_COMMON_OPTS + [ + # The number of rows or examples + OptionSet("num-examples", [1000000], can_scale=True) +] + +# Generalized Linear Model (GLM) Tests # +MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ + # The number of features per example + OptionSet("num-features", [10000], can_scale=False), + # The number of iterations for SGD + OptionSet("num-iterations", [20]), + # The step size for SGD + OptionSet("step-size", [0.001]), + # Regularization type: none, l1, l2 + OptionSet("reg-type", ["l2"]), + # Regularization parameter + OptionSet("reg-param", [0.1]) +] +if MLLIB_SPARK_VERSION >= 1.1: + MLLIB_GLM_TEST_OPTS += [ + # Optimization algorithm: sgd, l-bfgs + OptionSet("optimizer", ["sgd", "l-bfgs"]) + ] +if MLLIB_SPARK_VERSION >= 1.5: + MLLIB_GLM_TEST_OPTS += [ + # Ignored, but required for config + OptionSet("elastic-net-param", [0.0]) + ] + +# GLM Regression Tests # +MLLIB_GLM_REGRESSION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [ + # The intercept for the data + OptionSet("intercept", [0.0]), + # The scale factor for label noise + OptionSet("label-noise", [0.1]), + # Loss to minimize: l2 (squared error) + OptionSet("loss", ["l2"]) +] + +MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] + MLLIB_GLM_REGRESSION_TEST_OPTS)] + +# Classification Tests # +MLLIB_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [ + # Expected fraction of examples which are negative + OptionSet("per-negative", [0.3]), +] + +# GLM Classification Tests # +MLLIB_GLM_CLASSIFICATION_TEST_OPTS = MLLIB_CLASSIFICATION_TEST_OPTS + [ + # Loss to minimize: logistic, hinge (SVM) + OptionSet("loss", ["logistic"]) +] + +MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] + + MLLIB_GLM_CLASSIFICATION_TEST_OPTS)] + +if MLLIB_SPARK_VERSION >= 1.5: + MLLIB_GLM_ELASTIC_NET_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ + # The max number of iterations for LBFGS/OWLQN + OptionSet("num-iterations", [20]), + # LBFGS/OWLQN is used with elastic-net regularization. + OptionSet("optimizer", ["auto"]), + # Using elastic-net regularization. + OptionSet("reg-type", ["elastic-net"]), + # Runs with L2 (param = 0.0), L1 (param = 1.0). + OptionSet("elastic-net-param", [0.0, 1.0]), + # Regularization param (lambda) + OptionSet("reg-param", [0.01]), + # The scale factor for the noise in feature values + OptionSet("feature-noise", [1.0]), + # The scale factor for the noise in label values + OptionSet("label-noise", [0.1]), + # The intercept for the data + OptionSet("intercept", [0.2]), + # The step size is not used in LBFGS, but this is required in parameter checking. 
+ OptionSet("step-size", [0.0]) + ] + + MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [ + # Loss to minimize: l2 (squared error) + OptionSet("loss", ["l2"]) + ] + + # Test L-BFGS + MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] + + MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS + + [OptionSet("num-features", [10000], can_scale=False)])] + # Test normal equation solver + MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] + + MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS + + [OptionSet("num-features", [100], can_scale=False)])] + + MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [ + # In GLM classification with elastic-net regularization, only logistic loss is supported. + OptionSet("loss", ["logistic"]) + ] + + # Test L-BFGS + MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] + + MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS + + [OptionSet("num-features", [10000], can_scale=False)])] + # Test normal equation solver + MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] + + MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS + + [OptionSet("num-features", [100], can_scale=False)])] + +NAIVE_BAYES_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ + # The number of features per example + OptionSet("num-features", [10000], can_scale=False), + # Expected fraction of examples which are negative + OptionSet("per-negative", [0.3]), + # The scale factor for the noise in feature values + OptionSet("feature-noise", [1.0]), + # Naive Bayes smoothing lambda. + OptionSet("nb-lambda", [1.0]), + # Model type: either multinomial or bernoulli (bernoulli only available in Spark 1.4+) + OptionSet("model-type", ["multinomial"]), +] + +MLLIB_TESTS += [("naive-bayes", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("naive-bayes")] + + NAIVE_BAYES_TEST_OPTS)] + +# Decision Trees # +MLLIB_DECISION_TREE_TEST_OPTS = MLLIB_COMMON_OPTS + [ + # The number of rows or examples + OptionSet("num-examples", [1000000], can_scale=True), + # The number of features per example + OptionSet("num-features", [500], can_scale=False), + # Type of label: 0 indicates regression, 2+ indicates classification with this many classes + # Note: multi-class (>2) is not supported in Spark 1.0. + OptionSet("label-type", [0, 2], can_scale=False), + # Fraction of features which are categorical + OptionSet("frac-categorical-features", [0.5], can_scale=False), + # Fraction of categorical features which are binary. Others have 20 categories. + OptionSet("frac-binary-features", [0.5], can_scale=False), + # Depth of true decision tree model used to label examples. + # WARNING: The meaning of depth changed from Spark 1.0 to Spark 1.1: + # depth=N for Spark 1.0 should be depth=N-1 for Spark 1.1 + OptionSet("tree-depth", [5, 10], can_scale=False), + # Maximum number of bins for the decision tree learning algorithm. 
+ OptionSet("max-bins", [32], can_scale=False), +] + +if MLLIB_SPARK_VERSION >= 1.2: + ensembleTypes = ["RandomForest"] + if MLLIB_SPARK_VERSION >= 1.3: + ensembleTypes.append("GradientBoostedTrees") + if MLLIB_SPARK_VERSION >= 1.4: + ensembleTypes.extend(["ml.RandomForest", "ml.GradientBoostedTrees"]) + MLLIB_DECISION_TREE_TEST_OPTS += [ + # Ensemble type: mllib.RandomForest, mllib.GradientBoostedTrees, + # ml.RandomForest, ml.GradientBoostedTrees + OptionSet("ensemble-type", ensembleTypes), + # Path to training dataset (if not given, use random data). + OptionSet("training-data", [""]), + # Path to test dataset (only used if training dataset given). + # If not given, hold out part of training data for validation. + OptionSet("test-data", [""]), + # Fraction of data to hold out for testing + # (Ignored if given training and test dataset, or if using synthetic data.) + OptionSet("test-data-fraction", [0.2], can_scale=False), + # Number of trees. If 1, then run DecisionTree. If >1, then run RandomForest. + OptionSet("num-trees", [1, 10], can_scale=False), + # Feature subset sampling strategy: auto, all, sqrt, log2, onethird + # (only used for RandomForest) + OptionSet("feature-subset-strategy", ["auto"]) + ] + +MLLIB_TESTS += [("decision-tree", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("decision-tree")] + + MLLIB_DECISION_TREE_TEST_OPTS)] + +# Recommendation Tests # +MLLIB_RECOMMENDATION_TEST_OPTS = MLLIB_COMMON_OPTS + [ + # The number of users + OptionSet("num-users", [6000000], can_scale=True), + # The number of products + OptionSet("num-products", [5000000], can_scale=False), + # The number of ratings + OptionSet("num-ratings", [50000000], can_scale=True), + # The number of iterations for ALS + OptionSet("num-iterations", [10]), + # The rank of the factorized matrix model + OptionSet("rank", [10]), + # The regularization parameter + OptionSet("reg-param", [0.1]), + # Whether to use implicit preferences or not + FlagSet("implicit-prefs", [False]) +] + +MLLIB_TESTS += [("als", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("als")] + + MLLIB_RECOMMENDATION_TEST_OPTS)] + +# Clustering Tests # +MLLIB_CLUSTERING_TEST_OPTS = MLLIB_COMMON_OPTS + [ + # The number of points + OptionSet("num-points", [1000000], can_scale=True), + # The number of features per point + OptionSet("num-columns", [10000], can_scale=False), + # The number of centers + OptionSet("num-centers", [20]), + # The number of iterations for KMeans + OptionSet("num-iterations", [20]) +] + +MLLIB_TESTS += [("kmeans", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("kmeans")] + MLLIB_CLUSTERING_TEST_OPTS)] + +MLLIB_GMM_TEST_OPTS = MLLIB_COMMON_OPTS + [ + OptionSet("num-points", [1000000], can_scale=True), + OptionSet("num-columns", [100], can_scale=False), + OptionSet("num-centers", [20], can_scale=False), + OptionSet("num-iterations", [20])] + +if MLLIB_SPARK_VERSION >= 1.3: + MLLIB_TESTS += [("gmm", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("gmm")] + MLLIB_GMM_TEST_OPTS)] + +MLLIB_LDA_TEST_OPTS = MLLIB_COMMON_OPTS + [ + OptionSet("num-documents", [50000], can_scale=True), + OptionSet("num-vocab", [10000], can_scale=False), + OptionSet("num-topics", [20], can_scale=False), + OptionSet("num-iterations", [20]), + OptionSet("document-length", [100]), + OptionSet("optimizer", ["em", "online"])] + +if MLLIB_SPARK_VERSION >= 1.4: + MLLIB_TESTS += [("lda", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, 
[ConstantOption("lda")] + MLLIB_LDA_TEST_OPTS)] + +# TODO: tune PIC test size to run in 20-30 seconds +MLLIB_PIC_TEST_OPTS = MLLIB_COMMON_OPTS + [ + OptionSet("num-points", [10000], can_scale=True), + OptionSet("node-degree", [10], can_scale=False), + OptionSet("num-centers", [20], can_scale=False), + OptionSet("num-iterations", [20])] + +if MLLIB_SPARK_VERSION >= 1.3: + MLLIB_TESTS += [("pic", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("pic")] + MLLIB_CLUSTERING_TEST_OPTS)] + +# Linear Algebra Tests # +MLLIB_LINALG_TEST_OPTS = MLLIB_COMMON_OPTS + [ + # The number of rows for the matrix + OptionSet("num-rows", [1000000], can_scale=True), + # The number of columns for the matrix + OptionSet("num-cols", [1000], can_scale=False), + # The number of top singular values wanted for SVD and PCA + OptionSet("rank", [50], can_scale=False) +] +# Linear Algebra Tests which take more time (slightly smaller settings) # +MLLIB_BIG_LINALG_TEST_OPTS = MLLIB_COMMON_OPTS + [ + # The number of rows for the matrix + OptionSet("num-rows", [1000000], can_scale=True), + # The number of columns for the matrix + OptionSet("num-cols", [500], can_scale=False), + # The number of top singular values wanted for SVD and PCA + OptionSet("rank", [10], can_scale=False) +] + +MLLIB_TESTS += [("svd", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("svd")] + MLLIB_BIG_LINALG_TEST_OPTS)] + +MLLIB_TESTS += [("pca", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("pca")] + MLLIB_LINALG_TEST_OPTS)] + +MLLIB_TESTS += [("summary-statistics", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("summary-statistics")] + + MLLIB_LINALG_TEST_OPTS)] + +MLLIB_BLOCK_MATRIX_MULT_TEST_OPTS = MLLIB_COMMON_OPTS + [ + OptionSet("m", [20000], can_scale=True), + OptionSet("k", [10000], can_scale=False), + OptionSet("n", [10000], can_scale=False), + OptionSet("block-size", [1024], can_scale=False)] + +if MLLIB_SPARK_VERSION >= 1.3: + MLLIB_TESTS += [("block-matrix-mult", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("block-matrix-mult")] + MLLIB_BLOCK_MATRIX_MULT_TEST_OPTS)] + +# Statistic Toolkit Tests # +MLLIB_STATS_TEST_OPTS = MLLIB_COMMON_OPTS + +MLLIB_PEARSON_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ + [OptionSet("num-rows", [1000000], can_scale=True), + OptionSet("num-cols", [1000], can_scale=False)] + +MLLIB_SPEARMAN_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ + [OptionSet("num-rows", [1000000], can_scale=True), + OptionSet("num-cols", [100], can_scale=False)] + +MLLIB_CHI_SQ_FEATURE_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ + [OptionSet("num-rows", [2000000], can_scale=True), + OptionSet("num-cols", [500], can_scale=False)] + +MLLIB_CHI_SQ_GOF_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ + [OptionSet("num-rows", [50000000], can_scale=True), + OptionSet("num-cols", [0], can_scale=False)] + +MLLIB_CHI_SQ_MAT_TEST_OPTS = MLLIB_STATS_TEST_OPTS + \ + [OptionSet("num-rows", [20000], can_scale=True), + OptionSet("num-cols", [0], can_scale=False)] + +if MLLIB_SPARK_VERSION >= 1.1: + MLLIB_TESTS += [("pearson", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("pearson")] + MLLIB_PEARSON_TEST_OPTS)] + + MLLIB_TESTS += [("spearman", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("spearman")] + MLLIB_SPEARMAN_TEST_OPTS)] + + MLLIB_TESTS += [("chi-sq-feature", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("chi-sq-feature")] + MLLIB_CHI_SQ_FEATURE_TEST_OPTS)] + + 
MLLIB_TESTS += [("chi-sq-gof", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("chi-sq-gof")] + MLLIB_CHI_SQ_GOF_TEST_OPTS)] + + MLLIB_TESTS += [("chi-sq-mat", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("chi-sq-mat")] + MLLIB_CHI_SQ_MAT_TEST_OPTS)] + +# Feature Transformation Tests # + +MLLIB_FEATURE_TEST_OPTS = MLLIB_COMMON_OPTS + +MLLIB_WORD2VEC_TEST_OPTS = MLLIB_FEATURE_TEST_OPTS + \ + [OptionSet("num-sentences", [1000000], can_scale=True), + OptionSet("num-words", [10000], can_scale=False), + OptionSet("vector-size", [100], can_scale=False), + OptionSet("num-iterations", [3], can_scale=False), + OptionSet("min-count", [5], can_scale=False)] + +if MLLIB_SPARK_VERSION >= 1.3: # TODO: make it work in 1.2 + MLLIB_TESTS += [("word2vec", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("word2vec")] + MLLIB_WORD2VEC_TEST_OPTS)] + +# Frequent Pattern Matching Tests # + +MLLIB_FPM_TEST_OPTS = MLLIB_COMMON_OPTS + +MLLIB_FP_GROWTH_TEST_OPTS = MLLIB_FPM_TEST_OPTS + \ + [OptionSet("num-baskets", [5000000], can_scale=True), + OptionSet("avg-basket-size", [10], can_scale=False), + OptionSet("num-items", [1000], can_scale=False), + OptionSet("min-support", [0.01], can_scale=False)] + +if MLLIB_SPARK_VERSION >= 1.3: + MLLIB_TESTS += [("fp-growth", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("fp-growth")] + MLLIB_FP_GROWTH_TEST_OPTS)] + +# TODO: tune test size to have runtime within 30-60 seconds +MLLIB_PREFIX_SPAN_TEST_OPTS = MLLIB_FPM_TEST_OPTS + \ + [OptionSet("num-sequences", [5000000], can_scale=True), + OptionSet("avg-sequence-size", [5], can_scale=False), + OptionSet("avg-itemset-size", [1], can_scale=False), + OptionSet("num-items", [100], can_scale=False), + OptionSet("min-support", [0.5], can_scale=False), + OptionSet("max-pattern-len", [10], can_scale=False), + OptionSet("max-local-proj-db-size", [32000000], can_scale=False)] + +if MLLIB_SPARK_VERSION >= 1.5: + MLLIB_TESTS += [("prefix-span", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("prefix-span")] + MLLIB_PREFIX_SPAN_TEST_OPTS)] + +# Python MLlib tests +PYTHON_MLLIB_TESTS = [] + +PYTHON_MLLIB_TESTS += [("python-glm-classification", "mllib_tests.py", SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("GLMClassificationTest")] + + MLLIB_GLM_CLASSIFICATION_TEST_OPTS)] + +PYTHON_MLLIB_TESTS += [("python-glm-regression", "mllib_tests.py", SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("GLMRegressionTest")] + + MLLIB_GLM_REGRESSION_TEST_OPTS)] + +PYTHON_MLLIB_TESTS += [("python-naive-bayes", "mllib_tests.py", SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("NaiveBayesTest")] + + NAIVE_BAYES_TEST_OPTS)] + +PYTHON_MLLIB_TESTS += [("python-als", "mllib_tests.py", SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("ALSTest")] + + MLLIB_RECOMMENDATION_TEST_OPTS)] + +PYTHON_MLLIB_TESTS += [("python-kmeans", "mllib_tests.py", SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("KMeansTest")] + MLLIB_CLUSTERING_TEST_OPTS)] + +if MLLIB_SPARK_VERSION >= 1.1: + PYTHON_MLLIB_TESTS += [("python-pearson", "mllib_tests.py", SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("PearsonCorrelationTest")] + + MLLIB_PEARSON_TEST_OPTS)] + + PYTHON_MLLIB_TESTS += [("python-spearman", "mllib_tests.py", SCALE_FACTOR, + MLLIB_JAVA_OPTS, [ConstantOption("SpearmanCorrelationTest")] + + MLLIB_SPEARMAN_TEST_OPTS)] + From cd7eb1244525dcb983bc7cb268318eeb4721fca0 Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Thu, 30 Jun 2016 17:01:22 +0100 
Subject: [PATCH 06/22] Add ability to override scala version and cleanup in mllib project file --- mllib-tests/project/MLlibTestsBuild.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala index ebf12ab..6a51e93 100644 --- a/mllib-tests/project/MLlibTestsBuild.scala +++ b/mllib-tests/project/MLlibTestsBuild.scala @@ -16,8 +16,8 @@ object MLlibTestsBuild extends Build { lazy val commonSettings = Seq( organization := "org.spark-project", version := "0.1", - scalaVersion := "2.11.8", - sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0-SNAPSHOT"), + scalaVersion := sys.props.getOrElse("scala.version", default="2.11.8"), + sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0"), libraryDependencies ++= Seq( "net.sf.jopt-simple" % "jopt-simple" % "4.6", "org.scalatest" %% "scalatest" % "2.2.1" % "test", @@ -42,9 +42,9 @@ object MLlibTestsBuild extends Build { println("sparkVersion.value is: " + sparkVersion.value) val targetFolder = sparkVersion.value match { case v if v.startsWith("1.4.") => "v1p4" - case v if v.startsWith("1.5.") => "v1p5" + case v if v.startsWith("1.5.") => "v1p5" // acceptable for now, but change later when new algs are added case v if v.startsWith("1.6.") => "v1p5" - case v if v.startsWith("2.0") => "v2p0" + case v if v.startsWith("2.0") => "v2p0" case _ => throw new IllegalArgumentException(s"This Spark version isn't suppored: ${sparkVersion.value}.") } baseDirectory.value / targetFolder / "src" / "main" / "scala" From 95d31d300bb8c3f52ca74a8210ce185245c7a799 Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Thu, 30 Jun 2016 17:15:38 +0100 Subject: [PATCH 07/22] Add QA 1.6 fixes code back in --- .../scala/mllib/perf/MLAlgorithmTests.scala | 22 +++---- .../mllib/perf/MLAlgorithmTests.scala.rej | 58 +++++++++++++++++++ .../perf/clustering/GaussianMixtureTest.scala | 14 ++--- .../scala/mllib/perf/clustering/PICTest.scala | 13 +++-- .../scala/mllib/perf/util/DataGenerator.scala | 8 +-- .../scala/mllib/perf/MLAlgorithmTests.scala | 37 +++++++----- .../perf/clustering/GaussianMixtureTest.scala | 14 ++--- .../scala/mllib/perf/clustering/PICTest.scala | 13 +++-- .../scala/mllib/perf/util/DataGenerator.scala | 8 +-- pyspark-tests/mllib_tests.py | 8 +-- 10 files changed, 131 insertions(+), 64 deletions(-) create mode 100644 mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala.rej diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala index 6f89aac..0d438db 100644 --- a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala +++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala @@ -315,13 +315,13 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { def runTest(rdd: RDD[Vector]): KMeansModel - val NUM_POINTS = ("num-points", "number of points for clustering tests") - val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") + val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") + val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) - longOptions = longOptions ++ 
Seq(NUM_POINTS) + intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) + longOptions = longOptions ++ Seq(NUM_EXAMPLES) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() @@ -329,21 +329,21 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { var testRdd: RDD[Vector] = _ def validate(model: KMeansModel, rdd: RDD[Vector]): Double = { - val numPoints = rdd.cache().count() + val numExamples = rdd.cache().count() val error = model.computeCost(rdd) - math.sqrt(error/numPoints) + math.sqrt(error/numExamples) } override def createInputData(seed: Long) = { val numPartitions: Int = intOptionValue(NUM_PARTITIONS) - val numPoints: Long = longOptionValue(NUM_POINTS) - val numColumns: Int = intOptionValue(NUM_COLUMNS) + val numExamples: Long = longOptionValue(NUM_EXAMPLES) + val numFeatures: Int = intOptionValue(NUM_FEATURES) val numCenters: Int = intOptionValue(NUM_CENTERS) - val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numPoints*1.25).toLong, numColumns, + val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numExamples*1.25).toLong, numFeatures, numCenters, numPartitions, seed) val split = data.randomSplit(Array(0.8, 0.2), seed) @@ -441,9 +441,10 @@ class ALSTest(sc: SparkContext) extends RecommendationTests(sc) { val rank: Int = intOptionValue(RANK) val regParam = doubleOptionValue(REG_PARAM) val seed = intOptionValue(RANDOM_SEED) + 12 + val implicitRatings: Boolean = booleanOptionValue(IMPLICIT) new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam) - .setBlocks(rdd.partitions.size).run(rdd) + .setBlocks(rdd.partitions.length).setImplicitPrefs(implicitRatings).run(rdd) } } @@ -602,3 +603,4 @@ class DecisionTreeTest(sc: SparkContext) extends DecisionTreeTests(sc) { } } } + diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala.rej b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala.rej new file mode 100644 index 0000000..7c1776b --- /dev/null +++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala.rej @@ -0,0 +1,58 @@ +diff a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala (rejected hunks) +@@ -315,13 +315,13 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { + + def runTest(rdd: RDD[Vector]): KMeansModel + +- val NUM_POINTS = ("num-points", "number of points for clustering tests") +- val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") ++ val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") ++ val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") + val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") + val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") + +- intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) +- longOptions = longOptions ++ Seq(NUM_POINTS) ++ intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) ++ longOptions = longOptions ++ Seq(NUM_EXAMPLES) + val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions + addOptionsToParser() + +@@ -329,21 +329,21 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { + var testRdd: RDD[Vector] = _ + + def validate(model: KMeansModel, rdd: RDD[Vector]): Double = { +- val numPoints = 
rdd.cache().count() ++ val numExamples = rdd.cache().count() + + val error = model.computeCost(rdd) + +- math.sqrt(error/numPoints) ++ math.sqrt(error/numExamples) + } + + override def createInputData(seed: Long) = { + val numPartitions: Int = intOptionValue(NUM_PARTITIONS) + +- val numPoints: Long = longOptionValue(NUM_POINTS) +- val numColumns: Int = intOptionValue(NUM_COLUMNS) ++ val numExamples: Long = longOptionValue(NUM_EXAMPLES) ++ val numFeatures: Int = intOptionValue(NUM_FEATURES) + val numCenters: Int = intOptionValue(NUM_CENTERS) + +- val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numPoints*1.25).toLong, numColumns, ++ val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numExamples*1.25).toLong, numFeatures, + numCenters, numPartitions, seed) + + val split = data.randomSplit(Array(0.8, 0.2), seed) +@@ -441,9 +441,10 @@ class ALSTest(sc: SparkContext) extends RecommendationTests(sc) { + val rank: Int = intOptionValue(RANK) + val regParam = doubleOptionValue(REG_PARAM) + val seed = intOptionValue(RANDOM_SEED) + 12 ++ val implicitRatings: Boolean = booleanOptionValue(IMPLICIT) + + new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam) +- .setBlocks(rdd.partitions.size).run(rdd) ++ .setBlocks(rdd.partitions.length).setImplicitPrefs(implicitRatings).run(rdd) + } + } + diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala index 0004f8d..5903e2e 100644 --- a/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala +++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala @@ -16,21 +16,21 @@ import mllib.perf.PerfTest class GaussianMixtureTest(sc: SparkContext) extends PerfTest { // TODO: refactor k-means and GMM code - val NUM_POINTS = ("num-points", "number of points for clustering tests") - val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") + val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") + val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - intOptions ++= Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) - longOptions ++= Seq(NUM_POINTS) + intOptions ++= Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) + longOptions ++= Seq(NUM_EXAMPLES) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() var data: RDD[Vector] = _ override def createInputData(seed: Long): Unit = { - val m = longOptionValue(NUM_POINTS) - val n = intOptionValue(NUM_COLUMNS) + val m = longOptionValue(NUM_EXAMPLES) + val n = intOptionValue(NUM_FEATURES) val k = intOptionValue(NUM_CENTERS) val p = intOptionValue(NUM_PARTITIONS) @@ -47,7 +47,7 @@ class GaussianMixtureTest(sc: SparkContext) extends PerfTest { Vectors.dense(y.data) } }.cache() - logInfo(s"Generated ${data.count()} points.") + logInfo(s"Generated ${data.count()} examples.") } override def run(): JValue = { diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala index 6832ffa..2018c61 100644 --- a/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala +++ 
b/mllib-tests/v1p4/src/main/scala/mllib/perf/clustering/PICTest.scala @@ -11,28 +11,28 @@ import mllib.perf.PerfTest class PICTest(sc: SparkContext) extends PerfTest { - val NUM_POINTS = ("num-points", "number of points") + val NUM_EXAMPLES = ("num-examples", "number of examples") val NODE_DEGREE = ("node-degree", "number of neighbors each node is connected to") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") intOptions ++= Seq(NODE_DEGREE, NUM_CENTERS, NUM_ITERATIONS) - longOptions ++= Seq(NUM_POINTS) + longOptions ++= Seq(NUM_EXAMPLES) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() var data: RDD[(Long, Long, Double)] = _ override def createInputData(seed: Long): Unit = { - val numPoints = longOptionValue(NUM_POINTS) + val numExamples = longOptionValue(NUM_EXAMPLES) val nodeDegree = intOptionValue(NODE_DEGREE) val numPartitions = intOptionValue(NUM_PARTITIONS) // Generates a periodic banded matrix with bandwidth = nodeDegree - val data = sc.parallelize(0L to numPoints, numPartitions) + data = sc.parallelize(0L to numExamples, numPartitions) .flatMap { id => - (((id - nodeDegree / 2) % numPoints) until id).map { nbr => - (id, (nbr + numPoints) % numPoints, 1D) + (((id - nodeDegree / 2) % numExamples) until id).map { nbr => + (id, (nbr + numExamples) % numExamples, 1D) } } logInfo(s"Generated ${data.count()} pairwise similarities.") @@ -46,6 +46,7 @@ class PICTest(sc: SparkContext) extends PerfTest { .setK(k) .setMaxIterations(numIterations) val model = pic.run(data) + model.assignments.count() val duration = (System.currentTimeMillis() - start) / 1e3 "time" -> duration } diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala index 6e354fd..f721ca7 100644 --- a/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala +++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/util/DataGenerator.scala @@ -509,7 +509,7 @@ class FeaturesGenerator(val categoricalArities: Array[Int], val numContinuous: I class KMeansDataGenerator( val numCenters: Int, - val numColumns: Int, + val numFeatures: Int, val seed: Long) extends RandomDataGenerator[Vector] { private val rng = new java.util.Random(seed) @@ -528,7 +528,7 @@ class KMeansDataGenerator( } private val centers = (0 until numCenters).map{i => - Array.fill(numColumns)((2 * rng.nextDouble() - 1)*scale_factors(i)) + Array.fill(numFeatures)((2 * rng.nextDouble() - 1)*scale_factors(i)) } override def nextValue(): Vector = { @@ -536,12 +536,12 @@ class KMeansDataGenerator( val centerToAddTo = centers(concentrations.indexWhere(p => pick_center_rand <= p)) - Vectors.dense(Array.tabulate(numColumns)(i => centerToAddTo(i) + rng2.nextGaussian())) + Vectors.dense(Array.tabulate(numFeatures)(i => centerToAddTo(i) + rng2.nextGaussian())) } override def setSeed(seed: Long) { rng.setSeed(seed) } - override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numColumns, seed) + override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numFeatures, seed) } diff --git a/mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala index 1f1ec27..1c06465 100644 --- a/mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala +++ 
b/mllib-tests/v1p5/src/main/scala/mllib/perf/MLAlgorithmTests.scala @@ -97,10 +97,12 @@ abstract class GLMTests(sc: SparkContext) class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) { val INTERCEPT = ("intercept", "intercept for random data generation") + val FEATURE_NOISE = ("feature-noise", + "scale factor for the noise during feature generation; CURRENTLY IGNORED") val LABEL_NOISE = ("label-noise", "scale factor for the noise during label generation") val LOSS = ("loss", "loss to minimize. Supported: l2 (squared error).") - doubleOptions = doubleOptions ++ Seq(INTERCEPT, LABEL_NOISE) + doubleOptions = doubleOptions ++ Seq(INTERCEPT, FEATURE_NOISE, LABEL_NOISE) stringOptions = stringOptions ++ Seq(LOSS) val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions @@ -158,6 +160,7 @@ class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) { .setElasticNetParam(elasticNetParam) .setRegParam(regParam) .setMaxIter(numIterations) + .setTol(0.0) val sqlContext = new SQLContext(rdd.context) import sqlContext.implicits._ val mlModel = rr.fit(rdd.toDF()) @@ -265,6 +268,7 @@ class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) { .setElasticNetParam(elasticNetParam) .setRegParam(regParam) .setMaxIter(numIterations) + .setTol(0.0) val sqlContext = new SQLContext(rdd.context) import sqlContext.implicits._ val mlModel = lor.fit(rdd.toDF()) @@ -379,6 +383,8 @@ abstract class RecommendationTests(sc: SparkContext) extends PerfTest { val testMetric = validate(model, testRdd) + /* + // Removed temporarily because these methods are really slow. val numThingsToRecommend = 10 start = System.currentTimeMillis() model.recommendProductsForUsers(numThingsToRecommend).count() @@ -386,11 +392,11 @@ abstract class RecommendationTests(sc: SparkContext) extends PerfTest { start = System.currentTimeMillis() model.recommendUsersForProducts(numThingsToRecommend).count() val recommendUsersForProductsTime = (System.currentTimeMillis() - start).toDouble / 1000.0 - + */ Map("trainingTime" -> trainingTime, "testTime" -> testTime, - "trainingMetric" -> trainingMetric, "testMetric" -> testMetric, - "recommendProductsForUsersTime" -> recommendProductsForUsersTime, - "recommendUsersForProductsTime" -> recommendUsersForProductsTime) + "trainingMetric" -> trainingMetric, "testMetric" -> testMetric) + // "recommendProductsForUsersTime" -> recommendProductsForUsersTime, + // "recommendUsersForProductsTime" -> recommendUsersForProductsTime) } } @@ -398,13 +404,13 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { def runTest(rdd: RDD[Vector]): KMeansModel - val NUM_POINTS = ("num-points", "number of points for clustering tests") - val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") + val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") + val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) - longOptions = longOptions ++ Seq(NUM_POINTS) + intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) + longOptions = longOptions ++ Seq(NUM_EXAMPLES) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() @@ -412,21 +418,21 @@ abstract class 
ClusteringTests(sc: SparkContext) extends PerfTest { var testRdd: RDD[Vector] = _ def validate(model: KMeansModel, rdd: RDD[Vector]): Double = { - val numPoints = rdd.cache().count() + val numExamples = rdd.cache().count() val error = model.computeCost(rdd) - math.sqrt(error/numPoints) + math.sqrt(error/numExamples) } override def createInputData(seed: Long) = { val numPartitions: Int = intOptionValue(NUM_PARTITIONS) - val numPoints: Long = longOptionValue(NUM_POINTS) - val numColumns: Int = intOptionValue(NUM_COLUMNS) + val numExamples: Long = longOptionValue(NUM_EXAMPLES) + val numFeatures: Int = intOptionValue(NUM_FEATURES) val numCenters: Int = intOptionValue(NUM_CENTERS) - val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numPoints*1.25).toLong, numColumns, + val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numExamples*1.25).toLong, numFeatures, numCenters, numPartitions, seed) val split = data.randomSplit(Array(0.8, 0.2), seed) @@ -524,9 +530,10 @@ class ALSTest(sc: SparkContext) extends RecommendationTests(sc) { val rank: Int = intOptionValue(RANK) val regParam = doubleOptionValue(REG_PARAM) val seed = intOptionValue(RANDOM_SEED) + 12 + val implicitRatings: Boolean = booleanOptionValue(IMPLICIT) new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam) - .setBlocks(rdd.partitions.length).run(rdd) + .setBlocks(rdd.partitions.length).setImplicitPrefs(implicitRatings).run(rdd) } } diff --git a/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala b/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala index 95ce9c6..13da1ac 100644 --- a/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala +++ b/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/GaussianMixtureTest.scala @@ -15,21 +15,21 @@ import mllib.perf.PerfTest class GaussianMixtureTest(sc: SparkContext) extends PerfTest { - val NUM_POINTS = ("num-points", "number of points for clustering tests") - val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") + val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") + val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - intOptions ++= Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) - longOptions ++= Seq(NUM_POINTS) + intOptions ++= Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) + longOptions ++= Seq(NUM_EXAMPLES) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() var data: RDD[Vector] = _ override def createInputData(seed: Long): Unit = { - val m = longOptionValue(NUM_POINTS) - val n = intOptionValue(NUM_COLUMNS) + val m = longOptionValue(NUM_EXAMPLES) + val n = intOptionValue(NUM_FEATURES) val k = intOptionValue(NUM_CENTERS) val p = intOptionValue(NUM_PARTITIONS) @@ -46,7 +46,7 @@ class GaussianMixtureTest(sc: SparkContext) extends PerfTest { Vectors.dense(y.data) } }.cache() - logInfo(s"Generated ${data.count()} points.") + logInfo(s"Generated ${data.count()} examples.") } override def run(): JValue = { diff --git a/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/PICTest.scala b/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/PICTest.scala index 6832ffa..2018c61 100644 --- 
a/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/PICTest.scala +++ b/mllib-tests/v1p5/src/main/scala/mllib/perf/clustering/PICTest.scala @@ -11,28 +11,28 @@ import mllib.perf.PerfTest class PICTest(sc: SparkContext) extends PerfTest { - val NUM_POINTS = ("num-points", "number of points") + val NUM_EXAMPLES = ("num-examples", "number of examples") val NODE_DEGREE = ("node-degree", "number of neighbors each node is connected to") val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") intOptions ++= Seq(NODE_DEGREE, NUM_CENTERS, NUM_ITERATIONS) - longOptions ++= Seq(NUM_POINTS) + longOptions ++= Seq(NUM_EXAMPLES) val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions addOptionsToParser() var data: RDD[(Long, Long, Double)] = _ override def createInputData(seed: Long): Unit = { - val numPoints = longOptionValue(NUM_POINTS) + val numExamples = longOptionValue(NUM_EXAMPLES) val nodeDegree = intOptionValue(NODE_DEGREE) val numPartitions = intOptionValue(NUM_PARTITIONS) // Generates a periodic banded matrix with bandwidth = nodeDegree - val data = sc.parallelize(0L to numPoints, numPartitions) + data = sc.parallelize(0L to numExamples, numPartitions) .flatMap { id => - (((id - nodeDegree / 2) % numPoints) until id).map { nbr => - (id, (nbr + numPoints) % numPoints, 1D) + (((id - nodeDegree / 2) % numExamples) until id).map { nbr => + (id, (nbr + numExamples) % numExamples, 1D) } } logInfo(s"Generated ${data.count()} pairwise similarities.") @@ -46,6 +46,7 @@ class PICTest(sc: SparkContext) extends PerfTest { .setK(k) .setMaxIterations(numIterations) val model = pic.run(data) + model.assignments.count() val duration = (System.currentTimeMillis() - start) / 1e3 "time" -> duration } diff --git a/mllib-tests/v1p5/src/main/scala/mllib/perf/util/DataGenerator.scala b/mllib-tests/v1p5/src/main/scala/mllib/perf/util/DataGenerator.scala index e65a5a5..33f041e 100644 --- a/mllib-tests/v1p5/src/main/scala/mllib/perf/util/DataGenerator.scala +++ b/mllib-tests/v1p5/src/main/scala/mllib/perf/util/DataGenerator.scala @@ -548,7 +548,7 @@ class FeaturesGenerator(val categoricalArities: Array[Int], val numContinuous: I class KMeansDataGenerator( val numCenters: Int, - val numColumns: Int, + val numFeatures: Int, val seed: Long) extends RandomDataGenerator[Vector] { private val rng = new java.util.Random(seed) @@ -567,7 +567,7 @@ class KMeansDataGenerator( } private val centers = (0 until numCenters).map{i => - Array.fill(numColumns)((2 * rng.nextDouble() - 1)*scale_factors(i)) + Array.fill(numFeatures)((2 * rng.nextDouble() - 1)*scale_factors(i)) } override def nextValue(): Vector = { @@ -575,12 +575,12 @@ class KMeansDataGenerator( val centerToAddTo = centers(concentrations.indexWhere(p => pick_center_rand <= p)) - Vectors.dense(Array.tabulate(numColumns)(i => centerToAddTo(i) + rng2.nextGaussian())) + Vectors.dense(Array.tabulate(numFeatures)(i => centerToAddTo(i) + rng2.nextGaussian())) } override def setSeed(seed: Long) { rng.setSeed(seed) } - override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numColumns, seed) + override def copy(): KMeansDataGenerator = new KMeansDataGenerator(numCenters, numFeatures, seed) } diff --git a/pyspark-tests/mllib_tests.py b/pyspark-tests/mllib_tests.py index 133d751..1b6a306 100644 --- a/pyspark-tests/mllib_tests.py +++ b/pyspark-tests/mllib_tests.py @@ -219,8 +219,8 @@ def __init__(self, sc): def 
createInputData(self): options = self.options - numTrain = options.num_points - numTest = int(options.num_points * 0.2) + numTrain = options.num_examples + numTest = int(options.num_examples * 0.2) self.trainRDD = LabeledDataGenerator.generateGLMData( self.sc, numTrain, options.num_features, options.num_partitions, options.random_seed, labelType=2) @@ -242,7 +242,7 @@ def __init__(self, sc): def createInputData(self): options = self.options self.data = FeaturesGenerator.generateContinuousData( - self.sc, options.num_points, options.num_columns, + self.sc, options.num_examples, options.num_features, options.num_partitions, options.random_seed) def runTest(self): @@ -368,8 +368,6 @@ def runTest(self): parser.add_option("--num-ratings", type="int", default=500) parser.add_option("--implicit-prefs", type="int", default=0) # MLLIB_CLUSTERING_TEST_OPTS - parser.add_option("--num-points", type="int", default=1000) - parser.add_option("--num-columns", type="int", default=10) parser.add_option("--num-centers", type="int", default=5) # MLLIB_LINALG_TEST_OPTS + MLLIB_STATS_TEST_OPTS parser.add_option("--num-rows", type="int", default=1000) From 8cb9e6240525143334c940fec9b8e7beab4e2b31 Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Thu, 30 Jun 2016 17:17:31 +0100 Subject: [PATCH 08/22] Remove .rej file too --- .../mllib/perf/MLAlgorithmTests.scala.rej | 58 ------------------- 1 file changed, 58 deletions(-) delete mode 100644 mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala.rej diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala.rej b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala.rej deleted file mode 100644 index 7c1776b..0000000 --- a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala.rej +++ /dev/null @@ -1,58 +0,0 @@ -diff a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala (rejected hunks) -@@ -315,13 +315,13 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { - - def runTest(rdd: RDD[Vector]): KMeansModel - -- val NUM_POINTS = ("num-points", "number of points for clustering tests") -- val NUM_COLUMNS = ("num-columns", "number of columns for each point for clustering tests") -+ val NUM_EXAMPLES = ("num-examples", "number of examples for clustering tests") -+ val NUM_FEATURES = ("num-features", "number of features for each example for clustering tests") - val NUM_CENTERS = ("num-centers", "number of centers for clustering tests") - val NUM_ITERATIONS = ("num-iterations", "number of iterations for the algorithm") - -- intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_COLUMNS, NUM_ITERATIONS) -- longOptions = longOptions ++ Seq(NUM_POINTS) -+ intOptions = intOptions ++ Seq(NUM_CENTERS, NUM_FEATURES, NUM_ITERATIONS) -+ longOptions = longOptions ++ Seq(NUM_EXAMPLES) - val options = intOptions ++ stringOptions ++ booleanOptions ++ longOptions ++ doubleOptions - addOptionsToParser() - -@@ -329,21 +329,21 @@ abstract class ClusteringTests(sc: SparkContext) extends PerfTest { - var testRdd: RDD[Vector] = _ - - def validate(model: KMeansModel, rdd: RDD[Vector]): Double = { -- val numPoints = rdd.cache().count() -+ val numExamples = rdd.cache().count() - - val error = model.computeCost(rdd) - -- math.sqrt(error/numPoints) -+ math.sqrt(error/numExamples) - } - - override def createInputData(seed: Long) = { - val numPartitions: Int = intOptionValue(NUM_PARTITIONS) - -- val numPoints: Long = longOptionValue(NUM_POINTS) -- val 
numColumns: Int = intOptionValue(NUM_COLUMNS) -+ val numExamples: Long = longOptionValue(NUM_EXAMPLES) -+ val numFeatures: Int = intOptionValue(NUM_FEATURES) - val numCenters: Int = intOptionValue(NUM_CENTERS) - -- val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numPoints*1.25).toLong, numColumns, -+ val data = DataGenerator.generateKMeansVectors(sc, math.ceil(numExamples*1.25).toLong, numFeatures, - numCenters, numPartitions, seed) - - val split = data.randomSplit(Array(0.8, 0.2), seed) -@@ -441,9 +441,10 @@ class ALSTest(sc: SparkContext) extends RecommendationTests(sc) { - val rank: Int = intOptionValue(RANK) - val regParam = doubleOptionValue(REG_PARAM) - val seed = intOptionValue(RANDOM_SEED) + 12 -+ val implicitRatings: Boolean = booleanOptionValue(IMPLICIT) - - new ALS().setIterations(numIterations).setRank(rank).setSeed(seed).setLambda(regParam) -- .setBlocks(rdd.partitions.size).run(rdd) -+ .setBlocks(rdd.partitions.length).setImplicitPrefs(implicitRatings).run(rdd) - } - } - From 9e26cfccf8b1b9a20743a0f650e5a7d8964a0e13 Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Thu, 30 Jun 2016 17:21:40 +0100 Subject: [PATCH 09/22] Comment dep on 2.0.0 preview --- spark-tests/project/SparkTestsBuild.scala | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/spark-tests/project/SparkTestsBuild.scala b/spark-tests/project/SparkTestsBuild.scala index 697b28a..707c39a 100644 --- a/spark-tests/project/SparkTestsBuild.scala +++ b/spark-tests/project/SparkTestsBuild.scala @@ -15,8 +15,14 @@ object SparkTestsBuild extends Build { "net.sf.jopt-simple" % "jopt-simple" % "4.6", "org.scalatest" %% "scalatest" % "2.2.1" % "test", "com.google.guava" % "guava" % "14.0.1", - "org.apache.spark" %% "spark-core" % "2.0.0-preview" % "provided", "org.json4s" %% "json4s-native" % "3.2.10" + + // IMPORTANT! + // We need to uncomment the below once Spark 2.0.0 becomes available + // This relies on using spark built under the lib folder + // of this project + //"org.apache.spark" %% "spark-core" % "2.0.0-SNAPSHOT" % "provided", + ), test in assembly := {}, outputPath in assembly := file("target/spark-perf-tests-assembly.jar"), From 9cc8cae334d0a0c6342b87921edf7d6cfaa41c75 Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Fri, 1 Jul 2016 11:14:17 +0100 Subject: [PATCH 10/22] Use original config.py but with SPARK_HOME recognised, default local cluster --- config/config.py.template | 85 ++++++++++++----------- mllib-tests/project/MLlibTestsBuild.scala | 2 +- 2 files changed, 46 insertions(+), 41 deletions(-) diff --git a/config/config.py.template b/config/config.py.template index 019cbac..8489d6f 100755 --- a/config/config.py.template +++ b/config/config.py.template @@ -19,17 +19,19 @@ from sparkperf.config_utils import FlagSet, JavaOptionSet, OptionSet, ConstantOp DEFAULT_HOME=os.environ['HOME'] +# Point to an installation of Spark on the cluster. SPARK_HOME_DIR = os.getenv('SPARK_HOME', DEFAULT_HOME) # Use a custom configuration directory SPARK_CONF_DIR = SPARK_HOME_DIR + "/conf" # Master used when submitting Spark jobs. 
-# For local clusters: "spark://%s:7077" % socket.gethostname() +# For EC2 clusters: open("/root/spark-ec2/cluster-url", 'r').readline().strip() +# For local clusters (default): "spark://%s:7077" % socket.gethostname() # For Yarn clusters: "yarn" -# Otherwise, the default uses the specified EC2 cluster -SPARK_CLUSTER_URL = "spark://%s:7077" % socket.gethostname() +SPARK_CLUSTER_URL="spark://%s:7077" % socket.gethostname() + IS_YARN_MODE = "yarn" in SPARK_CLUSTER_URL IS_MESOS_MODE = "mesos" in SPARK_CLUSTER_URL @@ -78,18 +80,18 @@ RESTART_SPARK_CLUSTER = RESTART_SPARK_CLUSTER and not IS_YARN_MODE RSYNC_SPARK_HOME = True # Which tests to run -RUN_SPARK_TESTS = False +RUN_SPARK_TESTS = True RUN_PYSPARK_TESTS = False RUN_STREAMING_TESTS = False -RUN_MLLIB_TESTS = True -RUN_PYTHON_MLLIB_TESTS = True +RUN_MLLIB_TESTS = False +RUN_PYTHON_MLLIB_TESTS = False # Which tests to prepare. Set this to true for the first # installation or whenever you make a change to the tests. PREP_SPARK_TESTS = True -PREP_PYSPARK_TESTS = True -PREP_STREAMING_TESTS = True -PREP_MLLIB_TESTS = True +PREP_PYSPARK_TESTS = False +PREP_STREAMING_TESTS = False +PREP_MLLIB_TESTS = False # Whether to warm up local disks (warm-up is only necessary on EC2). DISK_WARMUP = False @@ -102,7 +104,7 @@ DISK_WARMUP_BYTES = 200 * 1024 * 1024 DISK_WARMUP_FILES = 200 # Prompt for confirmation when deleting temporary files. -PROMPT_FOR_DELETES = False +PROMPT_FOR_DELETES = True # Files to write results to SPARK_OUTPUT_FILENAME = "results/spark_perf_output_%s_%s" % ( @@ -126,7 +128,7 @@ PYTHON_MLLIB_OUTPUT_FILENAME = "results/python_mllib_perf_output_%s_%s" % ( # number of records in a generated dataset) if you are running the tests with more # or fewer nodes. When developing new test suites, you might want to set this to a small # value suitable for a single machine, such as 0.001. -SCALE_FACTOR = 0.01 +SCALE_FACTOR = 1.0 assert SCALE_FACTOR > 0, "SCALE_FACTOR must be > 0." @@ -145,7 +147,7 @@ COMMON_JAVA_OPTS = [ # Fraction of JVM memory used for caching RDDs. JavaOptionSet("spark.storage.memoryFraction", [0.66]), JavaOptionSet("spark.serializer", ["org.apache.spark.serializer.JavaSerializer"]), - JavaOptionSet("spark.executor.memory", ["2g"]), + # JavaOptionSet("spark.executor.memory", ["9g"]), # Turn event logging on in order to better diagnose failed tests. Off by default as it crashes # releases prior to 1.0.2 # JavaOptionSet("spark.eventLog.enabled", [True]), @@ -153,7 +155,7 @@ COMMON_JAVA_OPTS = [ JavaOptionSet("spark.locality.wait", [str(60 * 1000 * 1000)]) ] # Set driver memory here -SPARK_DRIVER_MEMORY = "2g" +SPARK_DRIVER_MEMORY = "20g" # The following option value sets are shared among all tests. COMMON_OPTS = [ + # How many times to run each experiment - used to warm up system caches. @@ -316,7 +318,8 @@ STREAMING_COMMON_JAVA_OPTS = [ # Fraction of JVM memory used for caching RDDs.
JavaOptionSet("spark.storage.memoryFraction", [0.66]), JavaOptionSet("spark.serializer", ["org.apache.spark.serializer.JavaSerializer"]), - # JavaOptionSet("spark.executor.memory", ["2g"]) + # JavaOptionSet("spark.executor.memory", ["9g"]), + JavaOptionSet("spark.executor.extraJavaOptions", [" -XX:+UseConcMarkSweepGC "]) ] STREAMING_KEY_VAL_TEST_OPTS = STREAMING_COMMON_OPTS + streaming_batch_duration_opts(2000) + [ @@ -371,8 +374,7 @@ MLLIB_PERF_TEST_RUNNER = "mllib.perf.TestRunner" # * Build Spark locally by running `build/sbt assembly; build/sbt publishLocal` in the Spark root directory # * Set `USE_CLUSTER_SPARK = True` and `MLLIB_SPARK_VERSION = {desired Spark version, e.g. 1.5}` # * Don't use PREP_MLLIB_TESTS = True; instead manually run `cd mllib-tests; sbt/sbt -Dspark.version=1.5.0-SNAPSHOT clean assembly` to build perf tests - -MLLIB_SPARK_VERSION = "2.0" +MLLIB_SPARK_VERSION = 1.5 MLLIB_JAVA_OPTS = COMMON_JAVA_OPTS if MLLIB_SPARK_VERSION >= 1.1: @@ -401,6 +403,9 @@ MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS = MLLIB_COMMON_OPTS + [ # Generalized Linear Model (GLM) Tests # MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ + # The scale factor for the noise in feature values. + # Currently ignored for regression. + OptionSet("feature-noise", [1.0]), # The number of features per example OptionSet("num-features", [10000], can_scale=False), # The number of iterations for SGD @@ -412,11 +417,6 @@ MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ # Regularization parameter OptionSet("reg-param", [0.1]) ] -if MLLIB_SPARK_VERSION >= 1.1: - MLLIB_GLM_TEST_OPTS += [ - # Optimization algorithm: sgd, l-bfgs - OptionSet("optimizer", ["sgd", "l-bfgs"]) - ] if MLLIB_SPARK_VERSION >= 1.5: MLLIB_GLM_TEST_OPTS += [ # Ignored, but required for config @@ -425,6 +425,8 @@ if MLLIB_SPARK_VERSION >= 1.5: # GLM Regression Tests # MLLIB_GLM_REGRESSION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [ + # Optimization algorithm: sgd + OptionSet("optimizer", ["sgd"]), # The intercept for the data OptionSet("intercept", [0.0]), # The scale factor for label noise @@ -440,6 +442,8 @@ MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, MLLIB_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [ # Expected fraction of examples which are negative OptionSet("per-negative", [0.3]), + # Optimization algorithm: sgd, l-bfgs + OptionSet("optimizer", ["sgd", "l-bfgs"]) ] # GLM Classification Tests # @@ -466,15 +470,15 @@ if MLLIB_SPARK_VERSION >= 1.5: OptionSet("reg-param", [0.01]), # The scale factor for the noise in feature values OptionSet("feature-noise", [1.0]), - # The scale factor for the noise in label values - OptionSet("label-noise", [0.1]), - # The intercept for the data - OptionSet("intercept", [0.2]), # The step size is not used in LBFGS, but this is required in parameter checking. 
OptionSet("step-size", [0.0]) ] MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [ + # The scale factor for the noise in label values + OptionSet("label-noise", [0.1]), + # The intercept for the data + OptionSet("intercept", [0.2]), # Loss to minimize: l2 (squared error) OptionSet("loss", ["l2"]) ] @@ -488,9 +492,11 @@ if MLLIB_SPARK_VERSION >= 1.5: MLLIB_TESTS += [("glm-regression", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, MLLIB_JAVA_OPTS, [ConstantOption("glm-regression")] + MLLIB_GLM_ELASTIC_NET_REGRESSION_TEST_OPTS + - [OptionSet("num-features", [100], can_scale=False)])] + [OptionSet("num-features", [200], can_scale=False)])] MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS = MLLIB_GLM_ELASTIC_NET_TEST_OPTS + [ + # Expected fraction of examples which are negative + OptionSet("per-negative", [0.3]), # In GLM classification with elastic-net regularization, only logistic loss is supported. OptionSet("loss", ["logistic"]) ] @@ -504,7 +510,7 @@ if MLLIB_SPARK_VERSION >= 1.5: MLLIB_TESTS += [("glm-classification", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, MLLIB_JAVA_OPTS, [ConstantOption("glm-classification")] + MLLIB_GLM_ELASTIC_NET_CLASSIFICATION_TEST_OPTS + - [OptionSet("num-features", [100], can_scale=False)])] + [OptionSet("num-features", [200], can_scale=False)])] NAIVE_BAYES_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [ # The number of features per example @@ -597,10 +603,10 @@ MLLIB_TESTS += [("als", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, # Clustering Tests # MLLIB_CLUSTERING_TEST_OPTS = MLLIB_COMMON_OPTS + [ - # The number of points - OptionSet("num-points", [1000000], can_scale=True), + # The number of examples + OptionSet("num-examples", [1000000], can_scale=True), # The number of features per point - OptionSet("num-columns", [10000], can_scale=False), + OptionSet("num-features", [10000], can_scale=False), # The number of centers OptionSet("num-centers", [20]), # The number of iterations for KMeans @@ -611,8 +617,8 @@ MLLIB_TESTS += [("kmeans", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, MLLIB_JAVA_OPTS, [ConstantOption("kmeans")] + MLLIB_CLUSTERING_TEST_OPTS)] MLLIB_GMM_TEST_OPTS = MLLIB_COMMON_OPTS + [ - OptionSet("num-points", [1000000], can_scale=True), - OptionSet("num-columns", [100], can_scale=False), + OptionSet("num-examples", [1000000], can_scale=True), + OptionSet("num-features", [100], can_scale=False), OptionSet("num-centers", [20], can_scale=False), OptionSet("num-iterations", [20])] @@ -632,16 +638,15 @@ if MLLIB_SPARK_VERSION >= 1.4: MLLIB_TESTS += [("lda", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, MLLIB_JAVA_OPTS, [ConstantOption("lda")] + MLLIB_LDA_TEST_OPTS)] -# TODO: tune PIC test size to run in 20-30 seconds MLLIB_PIC_TEST_OPTS = MLLIB_COMMON_OPTS + [ - OptionSet("num-points", [10000], can_scale=True), - OptionSet("node-degree", [10], can_scale=False), - OptionSet("num-centers", [20], can_scale=False), + OptionSet("num-examples", [10000000], can_scale=True), + OptionSet("node-degree", [20], can_scale=False), + OptionSet("num-centers", [40], can_scale=False), OptionSet("num-iterations", [20])] if MLLIB_SPARK_VERSION >= 1.3: MLLIB_TESTS += [("pic", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, - MLLIB_JAVA_OPTS, [ConstantOption("pic")] + MLLIB_CLUSTERING_TEST_OPTS)] + MLLIB_JAVA_OPTS, [ConstantOption("pic")] + MLLIB_PIC_TEST_OPTS)] # Linear Algebra Tests # MLLIB_LINALG_TEST_OPTS = MLLIB_COMMON_OPTS + [ @@ -670,7 +675,7 @@ MLLIB_TESTS += [("pca", MLLIB_PERF_TEST_RUNNER, SCALE_FACTOR, MLLIB_TESTS += [("summary-statistics", MLLIB_PERF_TEST_RUNNER, 
SCALE_FACTOR, MLLIB_JAVA_OPTS, [ConstantOption("summary-statistics")] + - MLLIB_LINALG_TEST_OPTS)] + MLLIB_BIG_LINALG_TEST_OPTS)] MLLIB_BLOCK_MATRIX_MULT_TEST_OPTS = MLLIB_COMMON_OPTS + [ OptionSet("m", [20000], can_scale=True), @@ -754,8 +759,8 @@ if MLLIB_SPARK_VERSION >= 1.3: MLLIB_PREFIX_SPAN_TEST_OPTS = MLLIB_FPM_TEST_OPTS + \ [OptionSet("num-sequences", [5000000], can_scale=True), OptionSet("avg-sequence-size", [5], can_scale=False), - OptionSet("avg-itemset-size", [1], can_scale=False), - OptionSet("num-items", [100], can_scale=False), + OptionSet("avg-itemset-size", [2], can_scale=False), + OptionSet("num-items", [500], can_scale=False), OptionSet("min-support", [0.5], can_scale=False), OptionSet("max-pattern-len", [10], can_scale=False), OptionSet("max-local-proj-db-size", [32000000], can_scale=False)] diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala index 6a51e93..63c4328 100644 --- a/mllib-tests/project/MLlibTestsBuild.scala +++ b/mllib-tests/project/MLlibTestsBuild.scala @@ -45,7 +45,7 @@ object MLlibTestsBuild extends Build { case v if v.startsWith("1.5.") => "v1p5" // acceptable for now, but change later when new algs are added case v if v.startsWith("1.6.") => "v1p5" case v if v.startsWith("2.0") => "v2p0" - case _ => throw new IllegalArgumentException(s"This Spark version isn't suppored: ${sparkVersion.value}.") + case _ => throw new IllegalArgumentException(s"This Spark version isn't supported: ${sparkVersion.value}.") } baseDirectory.value / targetFolder / "src" / "main" / "scala" }, From 44f7a5fe6c4b4990fd6d59009e5ec512f1897013 Mon Sep 17 00:00:00 2001 From: Adam Roberts Date: Fri, 1 Jul 2016 11:21:46 +0100 Subject: [PATCH 11/22] Back to 3.2.9 for json native --- mllib-tests/project/MLlibTestsBuild.scala | 2 +- spark-tests/project/SparkTestsBuild.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala index 63c4328..43d6b2a 100644 --- a/mllib-tests/project/MLlibTestsBuild.scala +++ b/mllib-tests/project/MLlibTestsBuild.scala @@ -22,7 +22,7 @@ object MLlibTestsBuild extends Build { "net.sf.jopt-simple" % "jopt-simple" % "4.6", "org.scalatest" %% "scalatest" % "2.2.1" % "test", "org.slf4j" % "slf4j-log4j12" % "1.7.2", - "org.json4s" %% "json4s-native" % "3.2.10" + "org.json4s" %% "json4s-native" % "3.2.9" // IMPORTANT! // We need to uncomment the below once Spark 2.0.0 becomes available diff --git a/spark-tests/project/SparkTestsBuild.scala b/spark-tests/project/SparkTestsBuild.scala index 707c39a..3aec419 100644 --- a/spark-tests/project/SparkTestsBuild.scala +++ b/spark-tests/project/SparkTestsBuild.scala @@ -15,7 +15,7 @@ object SparkTestsBuild extends Build { "net.sf.jopt-simple" % "jopt-simple" % "4.6", "org.scalatest" %% "scalatest" % "2.2.1" % "test", "com.google.guava" % "guava" % "14.0.1", - "org.json4s" %% "json4s-native" % "3.2.10" + "org.json4s" %% "json4s-native" % "3.2.9" // IMPORTANT! 
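
Note on the hunk above: the `scalaSource in Compile` override is what routes a single build definition to a per-Spark-version source tree (v1p4, v1p5, v2p0), so the perf suites can diverge where MLlib's API changed. A minimal standalone sketch of the same pattern (the project name here is assumed; the spark-perf builds declare an equivalent `sparkVersion` key):

    import sbt._
    import Keys._

    object VersionDispatchSketch extends Build {
      // Assumed key; mirrors the setting the spark-perf builds declare.
      val sparkVersion = settingKey[String]("Spark version to build against")

      lazy val root = Project("mllib-perf", file(".")).settings(
        sparkVersion := sys.props.getOrElse("spark.version", "2.0.0"),
        scalaSource in Compile := {
          // Pick the source tree matching the Spark line under test,
          // failing fast at project load for anything unsupported.
          val targetFolder = sparkVersion.value match {
            case v if v.startsWith("1.4.") => "v1p4"
            case v if v.startsWith("1.5.") || v.startsWith("1.6.") => "v1p5"
            case v if v.startsWith("2.0") => "v2p0"
            case v => sys.error(s"This Spark version isn't supported: $v")
          }
          baseDirectory.value / targetFolder / "src" / "main" / "scala"
        }
      )
    }
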
From 44f7a5fe6c4b4990fd6d59009e5ec512f1897013 Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Fri, 1 Jul 2016 11:21:46 +0100
Subject: [PATCH 11/22] Back to 3.2.9 for json native

---
 mllib-tests/project/MLlibTestsBuild.scala | 2 +-
 spark-tests/project/SparkTestsBuild.scala | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala
index 63c4328..43d6b2a 100644
--- a/mllib-tests/project/MLlibTestsBuild.scala
+++ b/mllib-tests/project/MLlibTestsBuild.scala
@@ -22,7 +22,7 @@ object MLlibTestsBuild extends Build {
       "net.sf.jopt-simple" % "jopt-simple" % "4.6",
       "org.scalatest" %% "scalatest" % "2.2.1" % "test",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
-      "org.json4s" %% "json4s-native" % "3.2.10"
+      "org.json4s" %% "json4s-native" % "3.2.9"
 
       // IMPORTANT!
       // We need to uncomment the below once Spark 2.0.0 becomes available
diff --git a/spark-tests/project/SparkTestsBuild.scala b/spark-tests/project/SparkTestsBuild.scala
index 707c39a..3aec419 100644
--- a/spark-tests/project/SparkTestsBuild.scala
+++ b/spark-tests/project/SparkTestsBuild.scala
@@ -15,7 +15,7 @@ object SparkTestsBuild extends Build {
       "net.sf.jopt-simple" % "jopt-simple" % "4.6",
       "org.scalatest" %% "scalatest" % "2.2.1" % "test",
       "com.google.guava" % "guava" % "14.0.1",
-      "org.json4s" %% "json4s-native" % "3.2.10"
+      "org.json4s" %% "json4s-native" % "3.2.9"
 
       // IMPORTANT!
       // We need to uncomment the below once Spark 2.0.0 becomes available

From cf6d6717754f806c06593c0fdf70fcef6334b21d Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Fri, 1 Jul 2016 11:23:06 +0100
Subject: [PATCH 12/22] Tidying up

---
 .../v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala
index 0d438db..4dd1d49 100644
--- a/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala
+++ b/mllib-tests/v1p4/src/main/scala/mllib/perf/MLAlgorithmTests.scala
@@ -603,4 +603,3 @@ class DecisionTreeTest(sc: SparkContext) extends DecisionTreeTests(sc) {
     }
   }
 }
-

From 9e171d415e1099f4052055e55298e4f14a25b7cb Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Fri, 1 Jul 2016 16:36:33 +0100
Subject: [PATCH 13/22] Add default to 2.0 for spark mllib version

---
 config/config.py.template | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/config/config.py.template b/config/config.py.template
index 8489d6f..becd7b6 100755
--- a/config/config.py.template
+++ b/config/config.py.template
@@ -374,7 +374,8 @@ MLLIB_PERF_TEST_RUNNER = "mllib.perf.TestRunner"
 # * Build Spark locally by running `build/sbt assembly; build/sbt publishLocal` in the Spark root directory
 # * Set `USE_CLUSTER_SPARK = True` and `MLLIB_SPARK_VERSION = {desired Spark version, e.g. 1.5}`
 # * Don't use PREP_MLLIB_TESTS = True; instead manually run `cd mllib-tests; sbt/sbt -Dspark.version=1.5.0-SNAPSHOT clean assembly` to build perf tests
-MLLIB_SPARK_VERSION = 1.5
+
+MLLIB_SPARK_VERSION = 2.0
 
 MLLIB_JAVA_OPTS = COMMON_JAVA_OPTS
 if MLLIB_SPARK_VERSION >= 1.1:

From b35c29460f522e5cc4b7eef7c387da5ef5c78275 Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Mon, 15 Aug 2016 13:41:25 +0100
Subject: [PATCH 14/22] Use GA Spark 2 not preview

---
 mllib-tests/project/MLlibTestsBuild.scala         | 10 ++--------
 spark-tests/project/SparkTestsBuild.scala         | 10 ++--------
 streaming-tests/project/StreamingTestsBuild.scala |  4 ++--
 3 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala
index 43d6b2a..cf2641c 100644
--- a/mllib-tests/project/MLlibTestsBuild.scala
+++ b/mllib-tests/project/MLlibTestsBuild.scala
@@ -23,14 +23,8 @@ object MLlibTestsBuild extends Build {
       "org.scalatest" %% "scalatest" % "2.2.1" % "test",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
       "org.json4s" %% "json4s-native" % "3.2.9"
-
-      // IMPORTANT!
-      // We need to uncomment the below once Spark 2.0.0 becomes available
-      // This relies on using spark built under the lib folder
-      // of this project
-
-      //"org.apache.spark" %% "spark-core" % "2.0.0-SNAPSHOT" % "provided",
-      //"org.apache.spark" %% "spark-mllib" % "2.0.0-SNAPSHOT" % "provided"
+      "org.apache.spark" %% "spark-core" % "2.0.0",
+      "org.apache.spark" %% "spark-mllib" % "2.0.0"
     )
   )
 
diff --git a/spark-tests/project/SparkTestsBuild.scala b/spark-tests/project/SparkTestsBuild.scala
index 3aec419..8ea4915 100644
--- a/spark-tests/project/SparkTestsBuild.scala
+++ b/spark-tests/project/SparkTestsBuild.scala
@@ -15,14 +15,8 @@ object SparkTestsBuild extends Build {
       "net.sf.jopt-simple" % "jopt-simple" % "4.6",
       "org.scalatest" %% "scalatest" % "2.2.1" % "test",
       "com.google.guava" % "guava" % "14.0.1",
-      "org.json4s" %% "json4s-native" % "3.2.9"
-
-      // IMPORTANT!
-      // We need to uncomment the below once Spark 2.0.0 becomes available
-      // This relies on using spark built under the lib folder
-      // of this project
-      //"org.apache.spark" %% "spark-core" % "2.0.0-SNAPSHOT" % "provided",
-
+      "org.json4s" %% "json4s-native" % "3.2.9",
+      "org.apache.spark" %% "spark-core" % "2.0.0"
     ),
     test in assembly := {},
     outputPath in assembly := file("target/spark-perf-tests-assembly.jar"),
diff --git a/streaming-tests/project/StreamingTestsBuild.scala b/streaming-tests/project/StreamingTestsBuild.scala
index fc2569c..6bf6b25 100644
--- a/streaming-tests/project/StreamingTestsBuild.scala
+++ b/streaming-tests/project/StreamingTestsBuild.scala
@@ -20,8 +20,8 @@ object StreamingTestsBuild extends Build {
       "com.typesafe.akka" %% "akka-remote" % "2.3.11",
       "com.typesafe.akka" %% "akka-agent" % "2.3.11",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
-      "org.apache.spark" %% "spark-core" % "2.0.0-preview" % "provided",
-      "org.apache.spark" %% "spark-streaming" % "2.0.0-preview" % "provided"
+      "org.apache.spark" %% "spark-core" % "2.0.0",
+      "org.apache.spark" %% "spark-streaming" % "2.0.0"
     ),
     test in assembly := {},
     outputPath in assembly := file("target/streaming-perf-tests-assembly.jar"),

From 533f6a6e9e924a267e99214cfec17e314b9d31ab Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Mon, 15 Aug 2016 14:03:01 +0100
Subject: [PATCH 15/22] Ensure we can override Spark version for all projects

---
 mllib-tests/project/MLlibTestsBuild.scala         | 5 ++---
 spark-tests/project/SparkTestsBuild.scala         | 5 +++--
 streaming-tests/project/StreamingTestsBuild.scala | 7 ++++---
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala
index cf2641c..fa8b231 100644
--- a/mllib-tests/project/MLlibTestsBuild.scala
+++ b/mllib-tests/project/MLlibTestsBuild.scala
@@ -23,8 +23,8 @@ object MLlibTestsBuild extends Build {
       "org.scalatest" %% "scalatest" % "2.2.1" % "test",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
       "org.json4s" %% "json4s-native" % "3.2.9"
-      "org.apache.spark" %% "spark-core" % "2.0.0",
-      "org.apache.spark" %% "spark-mllib" % "2.0.0"
+      "org.apache.spark" %% "spark-core" % sparkVersion.value,
+      "org.apache.spark" %% "spark-mllib" % sparkVersion.value
     )
   )
 
@@ -33,7 +33,6 @@ object MLlibTestsBuild extends Build {
     file("."),
     settings = assemblySettings ++ commonSettings ++ Seq(
       scalaSource in Compile := {
-        println("sparkVersion.value is: " + sparkVersion.value)
         val targetFolder = sparkVersion.value match {
           case v if v.startsWith("1.4.") => "v1p4"
           case v if v.startsWith("1.5.") => "v1p5" // acceptable for now, but change later when new algs are added
diff --git a/spark-tests/project/SparkTestsBuild.scala b/spark-tests/project/SparkTestsBuild.scala
index 8ea4915..c842851 100644
--- a/spark-tests/project/SparkTestsBuild.scala
+++ b/spark-tests/project/SparkTestsBuild.scala
@@ -10,13 +10,14 @@ object SparkTestsBuild extends Build {
     settings = assemblySettings ++ Seq(
       organization := "org.spark-project",
       version := "0.1",
-      scalaVersion := "2.11.8",
+      scalaVersion := sys.props.getOrElse("scala.version", default="2.11.8"),
+      sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0"),
       libraryDependencies ++= Seq(
         "net.sf.jopt-simple" % "jopt-simple" % "4.6",
         "org.scalatest" %% "scalatest" % "2.2.1" % "test",
         "com.google.guava" % "guava" % "14.0.1",
         "org.json4s" %% "json4s-native" % "3.2.9",
-        "org.apache.spark" %% "spark-core" % "2.0.0"
+        "org.apache.spark" %% "spark-core" % sparkVersion
       ),
       test in assembly := {},
       outputPath in assembly := file("target/spark-perf-tests-assembly.jar"),
diff --git a/streaming-tests/project/StreamingTestsBuild.scala b/streaming-tests/project/StreamingTestsBuild.scala
index 6bf6b25..9c2e0bf 100644
--- a/streaming-tests/project/StreamingTestsBuild.scala
+++ b/streaming-tests/project/StreamingTestsBuild.scala
@@ -10,7 +10,8 @@ object StreamingTestsBuild extends Build {
     settings = assemblySettings ++ Seq(
       organization := "org.spark-project",
       version := "0.1",
-      scalaVersion := "2.11.8",
+      scalaVersion := sys.props.getOrElse("scala.version", default="2.11.8"),
+      sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0"),
       libraryDependencies ++= Seq(
         "net.sf.jopt-simple" % "jopt-simple" % "4.5",
         "org.scalatest" %% "scalatest" % "2.2.1" % "test",
@@ -20,8 +21,8 @@ object StreamingTestsBuild extends Build {
       "com.typesafe.akka" %% "akka-remote" % "2.3.11",
       "com.typesafe.akka" %% "akka-agent" % "2.3.11",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
-      "org.apache.spark" %% "spark-core" % "2.0.0",
-      "org.apache.spark" %% "spark-streaming" % "2.0.0"
+      "org.apache.spark" %% "spark-core" % sparkVersion,
+      "org.apache.spark" %% "spark-streaming" % sparkVersion
     ),
     test in assembly := {},
     outputPath in assembly := file("target/streaming-perf-tests-assembly.jar"),

From 1eb427c37b34302aa2770fe1ade2a9fb8f64b892 Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Mon, 15 Aug 2016 14:04:46 +0100
Subject: [PATCH 16/22] Add a missing comma for ml project file

---
 mllib-tests/project/MLlibTestsBuild.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala
index fa8b231..2ec5ecf 100644
--- a/mllib-tests/project/MLlibTestsBuild.scala
+++ b/mllib-tests/project/MLlibTestsBuild.scala
@@ -22,7 +22,7 @@ object MLlibTestsBuild extends Build {
       "net.sf.jopt-simple" % "jopt-simple" % "4.6",
       "org.scalatest" %% "scalatest" % "2.2.1" % "test",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
-      "org.json4s" %% "json4s-native" % "3.2.9"
+      "org.json4s" %% "json4s-native" % "3.2.9",
       "org.apache.spark" %% "spark-core" % sparkVersion.value,
       "org.apache.spark" %% "spark-mllib" % sparkVersion.value
     )

From 70963cfc9f2a6234d4db9f953e74b4b09bb8c89b Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Fri, 26 Aug 2016 13:26:45 +0100
Subject: [PATCH 17/22] Just use mllib provided artifact and remove core part

---
 mllib-tests/project/MLlibTestsBuild.scala | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala
index 2ec5ecf..4ebf2a1 100644
--- a/mllib-tests/project/MLlibTestsBuild.scala
+++ b/mllib-tests/project/MLlibTestsBuild.scala
@@ -23,8 +23,7 @@ object MLlibTestsBuild extends Build {
       "org.scalatest" %% "scalatest" % "2.2.1" % "test",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
       "org.json4s" %% "json4s-native" % "3.2.9",
-      "org.apache.spark" %% "spark-core" % sparkVersion.value,
-      "org.apache.spark" %% "spark-mllib" % sparkVersion.value
+      "org.apache.spark" %% "spark-mllib" % sparkVersion.value % "provided"
     )
   )
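
Note on PATCH 15 through PATCH 17: together they make the Spark and Scala versions overridable from the command line, e.g. `sbt/sbt -Dspark.version=2.0.0 -Dscala.version=2.11.8 clean assembly`. One sbt detail matters here: a `SettingKey[String]` is not itself a `String`, so it must be dereferenced with `.value` inside another setting; the bare `% sparkVersion` form in the PATCH 15 hunks for spark-tests and streaming-tests would not compile as written, which is presumably why the key is dropped from those projects again in PATCH 20 and PATCH 21. A sketch of the working pattern (object name assumed, mirroring the mllib-tests settings after PATCH 17):

    import sbt._
    import Keys._

    object OverridableVersionSketch {
      // Assumed key, equivalent to the one declared for mllib-tests.
      val sparkVersion = settingKey[String]("Spark version to build against")

      val exampleSettings = Seq(
        scalaVersion := sys.props.getOrElse("scala.version", "2.11.8"),
        sparkVersion := sys.props.getOrElse("spark.version", "2.0.0"),
        libraryDependencies ++= Seq(
          // Note the .value: the key itself cannot be used as a revision string.
          "org.apache.spark" %% "spark-mllib" % sparkVersion.value % "provided"
        )
      )
    }
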
From 1d1441ba259e1e8e64e868a921c83c28fec997da Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Fri, 26 Aug 2016 13:29:08 +0100
Subject: [PATCH 18/22] Use 2.0.0 to resolve artifacts for mllib

---
 config/config.py.template | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/config.py.template b/config/config.py.template
index becd7b6..c15d5e9 100755
--- a/config/config.py.template
+++ b/config/config.py.template
@@ -375,7 +375,7 @@ MLLIB_PERF_TEST_RUNNER = "mllib.perf.TestRunner"
 # * Set `USE_CLUSTER_SPARK = True` and `MLLIB_SPARK_VERSION = {desired Spark version, e.g. 1.5}`
 # * Don't use PREP_MLLIB_TESTS = True; instead manually run `cd mllib-tests; sbt/sbt -Dspark.version=1.5.0-SNAPSHOT clean assembly` to build perf tests
 
-MLLIB_SPARK_VERSION = 2.0
+MLLIB_SPARK_VERSION = 2.0.0
 
 MLLIB_JAVA_OPTS = COMMON_JAVA_OPTS
 if MLLIB_SPARK_VERSION >= 1.1:

From c873532c3ec91e814c78f1abcdf0c31702ebd99e Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Fri, 26 Aug 2016 14:41:08 +0100
Subject: [PATCH 19/22] Remove feature-noise from glm regression so we can build and run

---
 .../src/main/scala/mllib/perf/MLAlgorithmTests.scala | 12 ++++--------
 .../v2p0/src/main/scala/mllib/perf/StatTests.scala   |  2 +-
 .../main/scala/mllib/perf/util/DataGenerator.scala   |  8 +++-----
 3 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/MLAlgorithmTests.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/MLAlgorithmTests.scala
index 693ca7c..37409b7 100644
--- a/mllib-tests/v2p0/src/main/scala/mllib/perf/MLAlgorithmTests.scala
+++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/MLAlgorithmTests.scala
@@ -204,10 +204,9 @@ class GLMRegressionTest(sc: SparkContext) extends GLMTests(sc) {
 
 class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) {
   val THRESHOLD = ("per-negative", "probability for a negative label during data generation")
-  val FEATURE_NOISE = ("feature-noise", "scale factor for the noise during feature generation")
   val LOSS = ("loss", "loss to minimize. Supported: logistic, hinge (SVM).")
 
-  doubleOptions = doubleOptions ++ Seq(THRESHOLD, FEATURE_NOISE)
+  doubleOptions = doubleOptions ++ Seq(THRESHOLD)
   stringOptions = stringOptions ++ Seq(LOSS)
 
   val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions
@@ -227,10 +226,9 @@ class GLMClassificationTest(sc: SparkContext) extends GLMTests(sc) {
     val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
 
     val threshold: Double = doubleOptionValue(THRESHOLD)
-    val featureNoise: Double = doubleOptionValue(FEATURE_NOISE)
 
     val data = DataGenerator.generateClassificationLabeledPoints(sc,
-      math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, featureNoise, numPartitions,
+      math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, numPartitions,
       seed)
 
     val split = data.randomSplit(Array(0.8, 0.2), seed)
@@ -462,11 +460,10 @@ class NaiveBayesTest(sc: SparkContext)
   extends RegressionAndClassificationTests[NaiveBayesModel](sc) {
 
   val THRESHOLD = ("per-negative", "probability for a negative label during data generation")
-  val FEATURE_NOISE = ("feature-noise", "scale factor for the noise during feature generation")
   val SMOOTHING = ("nb-lambda", "the smoothing parameter lambda for Naive Bayes")
   val MODEL_TYPE = ("model-type", "either multinomial (default) or bernoulli")
 
-  doubleOptions = doubleOptions ++ Seq(THRESHOLD, FEATURE_NOISE, SMOOTHING)
+  doubleOptions = doubleOptions ++ Seq(THRESHOLD, SMOOTHING)
   stringOptions = stringOptions ++ Seq(MODEL_TYPE)
   val options = intOptions ++ stringOptions ++ booleanOptions ++ doubleOptions ++ longOptions
   addOptionsToParser()
@@ -478,7 +475,6 @@ class NaiveBayesTest(sc: SparkContext)
     val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
 
     val threshold: Double = doubleOptionValue(THRESHOLD)
-    val featureNoise: Double = doubleOptionValue(FEATURE_NOISE)
     val modelType = stringOptionValue(MODEL_TYPE)
 
     val data = if (modelType == "bernoulli") {
@@ -486,7 +482,7 @@ class NaiveBayesTest(sc: SparkContext)
         math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, numPartitions, seed)
     } else {
       val negdata = DataGenerator.generateClassificationLabeledPoints(sc,
-        math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, featureNoise, numPartitions,
+        math.ceil(numExamples * 1.25).toLong, numFeatures, threshold, numPartitions,
         seed)
       val dataNonneg = negdata.map { lp =>
         LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map(math.abs)))
diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/StatTests.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/StatTests.scala
index 21c286c..2e84629 100644
--- a/mllib-tests/v2p0/src/main/scala/mllib/perf/StatTests.scala
+++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/StatTests.scala
@@ -73,7 +73,7 @@ class ChiSquaredFeatureTest(sc: SparkContext) extends StatTests[RDD[LabeledPoint
     val n: Int = intOptionValue(NUM_COLS)
     val numPartitions: Int = intOptionValue(NUM_PARTITIONS)
 
-    rdd = DataGenerator.generateClassificationLabeledPoints(sc, m, n, 0.5, 1.0, numPartitions,
+    rdd = DataGenerator.generateClassificationLabeledPoints(sc, m, n, 0.5, numPartitions,
       seed, chiSq = true).cache()
 
     // Materialize rdd
diff --git a/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataGenerator.scala b/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataGenerator.scala
index e65a5a5..ff3fd00 100644
--- a/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataGenerator.scala
+++ b/mllib-tests/v2p0/src/main/scala/mllib/perf/util/DataGenerator.scala
@@ -49,12 +49,11 @@ object DataGenerator {
       numRows: Long,
       numCols: Int,
       threshold: Double,
-      featureNoise: Double,
       numPartitions: Int,
      seed: Long = System.currentTimeMillis(),
       chiSq: Boolean = false): RDD[LabeledPoint] = {
 
-    RandomRDDs.randomRDD(sc, new ClassLabelGenerator(numCols,threshold, featureNoise, chiSq),
+    RandomRDDs.randomRDD(sc, new ClassLabelGenerator(numCols,threshold, chiSq),
       numRows, numPartitions, seed)
   }
 
@@ -364,7 +363,6 @@ class RatingGenerator(
 class ClassLabelGenerator(
     private val numFeatures: Int,
     private val threshold: Double,
-    private val featureNoise: Double,
     private val chiSq: Boolean) extends RandomDataGenerator[LabeledPoint] {
 
   private val rng = new java.util.Random()
 
@@ -372,7 +370,7 @@ class ClassLabelGenerator(
   override def nextValue(): LabeledPoint = {
     val y = if (rng.nextDouble() < threshold) 0.0 else 1.0
     val x = Array.fill[Double](numFeatures) {
-      if (!chiSq) rng.nextGaussian() + (y * featureNoise) else rng.nextInt(6) * 1.0
+      if (!chiSq) rng.nextGaussian() + y else rng.nextInt(6) * 1.0
     }
 
     LabeledPoint(y, Vectors.dense(x))
@@ -383,7 +381,7 @@ class ClassLabelGenerator(
   }
 
   override def copy(): ClassLabelGenerator =
-    new ClassLabelGenerator(numFeatures, threshold, featureNoise, chiSq)
+    new ClassLabelGenerator(numFeatures, threshold, chiSq)
 }
 
 class BinaryLabeledDataGenerator(
From e15a40e565568129c25b188328dc4ea1bd5b9d53 Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Fri, 26 Aug 2016 15:00:24 +0100
Subject: [PATCH 20/22] Remove sparkVersion statement in SparkTestsBuild.scala

---
 spark-tests/project/SparkTestsBuild.scala | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/spark-tests/project/SparkTestsBuild.scala b/spark-tests/project/SparkTestsBuild.scala
index c842851..f77c39d 100644
--- a/spark-tests/project/SparkTestsBuild.scala
+++ b/spark-tests/project/SparkTestsBuild.scala
@@ -11,13 +11,12 @@ object SparkTestsBuild extends Build {
       organization := "org.spark-project",
       version := "0.1",
       scalaVersion := sys.props.getOrElse("scala.version", default="2.11.8"),
-      sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0"),
       libraryDependencies ++= Seq(
         "net.sf.jopt-simple" % "jopt-simple" % "4.6",
         "org.scalatest" %% "scalatest" % "2.2.1" % "test",
         "com.google.guava" % "guava" % "14.0.1",
-        "org.json4s" %% "json4s-native" % "3.2.9",
-        "org.apache.spark" %% "spark-core" % sparkVersion
+        "org.apache.spark" %% "spark-core" % "2.0.0" % "provided",
+        "org.json4s" %% "json4s-native" % "3.2.9"
       ),
       test in assembly := {},
       outputPath in assembly := file("target/spark-perf-tests-assembly.jar"),

From 0ad07c7dc2c58b01688f08f0985fed88075eaf49 Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Fri, 26 Aug 2016 16:36:04 +0100
Subject: [PATCH 21/22] Remove spark version from streaming project file also

---
 streaming-tests/project/StreamingTestsBuild.scala | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/streaming-tests/project/StreamingTestsBuild.scala b/streaming-tests/project/StreamingTestsBuild.scala
index 9c2e0bf..39784f3 100644
--- a/streaming-tests/project/StreamingTestsBuild.scala
+++ b/streaming-tests/project/StreamingTestsBuild.scala
@@ -11,7 +11,6 @@ object StreamingTestsBuild extends Build {
       organization := "org.spark-project",
       version := "0.1",
       scalaVersion := sys.props.getOrElse("scala.version", default="2.11.8"),
-      sparkVersion := sys.props.getOrElse("spark.version", default="2.0.0"),
       libraryDependencies ++= Seq(
         "net.sf.jopt-simple" % "jopt-simple" % "4.5",
         "org.scalatest" %% "scalatest" % "2.2.1" % "test",
@@ -21,8 +20,8 @@ object StreamingTestsBuild extends Build {
       "com.typesafe.akka" %% "akka-remote" % "2.3.11",
       "com.typesafe.akka" %% "akka-agent" % "2.3.11",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
-      "org.apache.spark" %% "spark-core" % sparkVersion,
-      "org.apache.spark" %% "spark-streaming" % sparkVersion
+      "org.apache.spark" %% "spark-core" % "2.0.0" % "provided",
+      "org.apache.spark" %% "spark-streaming" % "2.0.0" % "provided"
     ),
     test in assembly := {},
     outputPath in assembly := file("target/streaming-perf-tests-assembly.jar"),

From e6cbaf9d266eca7e6e11a71f27c3655f412f02da Mon Sep 17 00:00:00 2001
From: Adam Roberts
Date: Tue, 27 Sep 2016 19:21:30 +0100
Subject: [PATCH 22/22] Lower defaults, feature-noise if Spark 1.x only

---
 config/config.py.template                 | 21 +++++++++++++++------
 mllib-tests/project/MLlibTestsBuild.scala |  5 ++++-
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/config/config.py.template b/config/config.py.template
index c15d5e9..aeb39ee 100755
--- a/config/config.py.template
+++ b/config/config.py.template
@@ -128,7 +128,7 @@ PYTHON_MLLIB_OUTPUT_FILENAME = "results/python_mllib_perf_output_%s_%s" % (
 # number of records in a generated dataset) if you are running the tests with more
 # or fewer nodes. When developing new test suites, you might want to set this to a small
 # value suitable for a single machine, such as 0.001.
-SCALE_FACTOR = 1.0
+SCALE_FACTOR = 0.01
 
 assert SCALE_FACTOR > 0, "SCALE_FACTOR must be > 0."
 
@@ -155,7 +155,8 @@ COMMON_JAVA_OPTS = [
     JavaOptionSet("spark.locality.wait", [str(60 * 1000 * 1000)])
 ]
 # Set driver memory here
-SPARK_DRIVER_MEMORY = "20g"
+SPARK_DRIVER_MEMORY = "1g"
+
 # The following options value sets are shared among all tests.
 COMMON_OPTS = [
     # How many times to run each experiment - used to warm up system caches.
@@ -375,7 +376,7 @@ MLLIB_PERF_TEST_RUNNER = "mllib.perf.TestRunner"
 # * Set `USE_CLUSTER_SPARK = True` and `MLLIB_SPARK_VERSION = {desired Spark version, e.g. 1.5}`
 # * Don't use PREP_MLLIB_TESTS = True; instead manually run `cd mllib-tests; sbt/sbt -Dspark.version=1.5.0-SNAPSHOT clean assembly` to build perf tests
 
-MLLIB_SPARK_VERSION = 2.0.0
+MLLIB_SPARK_VERSION = 2.0
 
 MLLIB_JAVA_OPTS = COMMON_JAVA_OPTS
 if MLLIB_SPARK_VERSION >= 1.1:
@@ -403,10 +404,8 @@ MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS = MLLIB_COMMON_OPTS + [
 ]
 
 # Generalized Linear Model (GLM) Tests #
+
 MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
-    # The scale factor for the noise in feature values.
-    # Currently ignored for regression.
-    OptionSet("feature-noise", [1.0]),
     # The number of features per example
     OptionSet("num-features", [10000], can_scale=False),
     # The number of iterations for SGD
@@ -418,12 +417,22 @@ MLLIB_GLM_TEST_OPTS = MLLIB_REGRESSION_CLASSIFICATION_TEST_OPTS + [
     # Regularization parameter
     OptionSet("reg-param", [0.1])
 ]
+
 if MLLIB_SPARK_VERSION >= 1.5:
     MLLIB_GLM_TEST_OPTS += [
         # Ignored, but required for config
         OptionSet("elastic-net-param", [0.0])
     ]
 
+if MLLIB_SPARK_VERSION < 2.0:
+    MLLIB_GLM_TEST_OPTS += [
+        # The scale factor for the noise in feature values.
+        # Currently ignored for regression.
+        # Only available in Spark 1.x
+        OptionSet("feature-noise", [1.0])
+    ]
+
+
 # GLM Regression Tests #
 MLLIB_GLM_REGRESSION_TEST_OPTS = MLLIB_GLM_TEST_OPTS + [
     # Optimization algorithm: sgd
diff --git a/mllib-tests/project/MLlibTestsBuild.scala b/mllib-tests/project/MLlibTestsBuild.scala
index 4ebf2a1..3622734 100644
--- a/mllib-tests/project/MLlibTestsBuild.scala
+++ b/mllib-tests/project/MLlibTestsBuild.scala
@@ -23,7 +23,10 @@ object MLlibTestsBuild extends Build {
       "org.scalatest" %% "scalatest" % "2.2.1" % "test",
       "org.slf4j" % "slf4j-log4j12" % "1.7.2",
       "org.json4s" %% "json4s-native" % "3.2.9",
-      "org.apache.spark" %% "spark-mllib" % sparkVersion.value % "provided"
+      // Allow the user to set the Spark version but default to look
+      // for the Spark 2.0.0 artifact. Uncomment below to use spark.version
+      // "org.apache.spark" %% "spark-mllib" % sparkVersion.value % "provided"
+      "org.apache.spark" %% "spark-mllib" % "2.0.0" % "provided"
    )
  )
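
Note on PATCH 22: besides lowering the defaults and gating `feature-noise` to the 1.x lines, it moves `MLLIB_SPARK_VERSION` back from `2.0.0` to `2.0`. The config template compares this value numerically (`>= 1.1`, `>= 1.5`, `< 2.0`), so it has to remain a two-part number — `2.0.0` is not even a valid Python numeric literal — while the full three-part artifact version lives only in the sbt build. A sketch of that split, with a hypothetical helper object, in Scala:

    // Sketch: keep the full artifact string for dependency resolution and
    // derive a numeric major.minor gate the way config.py.template uses it.
    object VersionGateSketch {
      val artifactVersion: String = sys.props.getOrElse("spark.version", "2.0.0")

      // "2.0.0" -> 2.0 (assumes at least a major.minor version string)
      val featureGate: Double = {
        val Array(major, minor) = artifactVersion.split("\\.").take(2)
        s"$major.$minor".toDouble
      }

      def main(args: Array[String]): Unit = {
        // e.g. the GLM feature-noise option is only added on the 1.x lines.
        val hasFeatureNoise = featureGate < 2.0
        println(s"artifact=$artifactVersion gate=$featureGate feature-noise=$hasFeatureNoise")
      }
    }
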