From d2ab68a51a03429e25a4b4ec965cbca089aa48a6 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 25 Feb 2019 04:37:45 -0600 Subject: [PATCH] [SPARK-26966][ML] Update to JPMML 1.4.8 JPMML apparently only supports Java 9 in 1.4.2+. We are seeing test failures from JPMML relating to JAXB when running on Java 11. It's shaded and not a big change, so should be safe. Existing tests. Closes #23868 from srowen/SPARK-26966. Authored-by: Sean Owen Signed-off-by: Sean Owen --- build.gradle | 2 +- .../BinaryClassificationPMMLModelExport.scala | 14 ++++++++------ .../export/GeneralizedLinearPMMLModelExport.scala | 10 ++++++---- .../mllib/pmml/export/KMeansPMMLModelExport.scala | 10 ++++++---- .../mllib/pmml/export/PMMLModelExportFactory.scala | 6 +++--- .../BinaryClassificationPMMLModelExportSuite.scala | 9 +++++---- .../GeneralizedLinearPMMLModelExportSuite.scala | 2 +- .../pmml/export/KMeansPMMLModelExportSuite.scala | 2 +- pom.xml | 12 ++++++++++++ 9 files changed, 43 insertions(+), 24 deletions(-) diff --git a/build.gradle b/build.gradle index 9b6009d9059a0..869abadd79b8f 100644 --- a/build.gradle +++ b/build.gradle @@ -107,7 +107,7 @@ allprojects { jlineVersion = '2.14.6' xbeanAsm5Version = '4.5' breezeVersion = '0.13.2' - pmmlVersion = '1.2.17' + pmmlVersion = '1.4.15' classutilVersion = '1.4.0' scoptVersion = '3.7.1' mesosVersion = '1.0.4' diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala index a8c32f72bfdeb..27935c6f5291a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExport.scala @@ -19,7 +19,9 @@ package org.apache.spark.mllib.pmml.export import scala.{Array => SArray} -import org.dmg.pmml._ +import org.dmg.pmml.{DataDictionary, DataField, DataType, 
FieldName, MiningField, + MiningFunction, MiningSchema, OpType} +import org.dmg.pmml.regression.{NumericPredictor, RegressionModel, RegressionTable} import org.apache.spark.mllib.regression.GeneralizedLinearModel @@ -29,7 +31,7 @@ import org.apache.spark.mllib.regression.GeneralizedLinearModel private[mllib] class BinaryClassificationPMMLModelExport( model: GeneralizedLinearModel, description: String, - normalizationMethod: RegressionNormalizationMethodType, + normalizationMethod: RegressionModel.NormalizationMethod, threshold: Double) extends PMMLModelExport { @@ -47,7 +49,7 @@ private[mllib] class BinaryClassificationPMMLModelExport( val miningSchema = new MiningSchema val regressionTableYES = new RegressionTable(model.intercept).setTargetCategory("1") var interceptNO = threshold - if (RegressionNormalizationMethodType.LOGIT == normalizationMethod) { + if (RegressionModel.NormalizationMethod.LOGIT == normalizationMethod) { if (threshold <= 0) { interceptNO = Double.MinValue } else if (threshold >= 1) { @@ -58,7 +60,7 @@ private[mllib] class BinaryClassificationPMMLModelExport( } val regressionTableNO = new RegressionTable(interceptNO).setTargetCategory("0") val regressionModel = new RegressionModel() - .setFunctionName(MiningFunctionType.CLASSIFICATION) + .setMiningFunction(MiningFunction.CLASSIFICATION) .setMiningSchema(miningSchema) .setModelName(description) .setNormalizationMethod(normalizationMethod) @@ -69,7 +71,7 @@ private[mllib] class BinaryClassificationPMMLModelExport( dataDictionary.addDataFields(new DataField(fields(i), OpType.CONTINUOUS, DataType.DOUBLE)) miningSchema .addMiningFields(new MiningField(fields(i)) - .setUsageType(FieldUsageType.ACTIVE)) + .setUsageType(MiningField.UsageType.ACTIVE)) regressionTableYES.addNumericPredictors(new NumericPredictor(fields(i), model.weights(i))) } @@ -79,7 +81,7 @@ private[mllib] class BinaryClassificationPMMLModelExport( .addDataFields(new DataField(targetField, OpType.CATEGORICAL, DataType.STRING)) 
miningSchema .addMiningFields(new MiningField(targetField) - .setUsageType(FieldUsageType.TARGET)) + .setUsageType(MiningField.UsageType.TARGET)) dataDictionary.setNumberOfFields(dataDictionary.getDataFields.size) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExport.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExport.scala index 4d951d2973a6f..723224de168e2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExport.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExport.scala @@ -19,7 +19,9 @@ package org.apache.spark.mllib.pmml.export import scala.{Array => SArray} -import org.dmg.pmml._ +import org.dmg.pmml.{DataDictionary, DataField, DataType, FieldName, MiningField, + MiningFunction, MiningSchema, OpType} +import org.dmg.pmml.regression.{NumericPredictor, RegressionModel, RegressionTable} import org.apache.spark.mllib.regression.GeneralizedLinearModel @@ -45,7 +47,7 @@ private[mllib] class GeneralizedLinearPMMLModelExport( val miningSchema = new MiningSchema val regressionTable = new RegressionTable(model.intercept) val regressionModel = new RegressionModel() - .setFunctionName(MiningFunctionType.REGRESSION) + .setMiningFunction(MiningFunction.REGRESSION) .setMiningSchema(miningSchema) .setModelName(description) .addRegressionTables(regressionTable) @@ -55,7 +57,7 @@ private[mllib] class GeneralizedLinearPMMLModelExport( dataDictionary.addDataFields(new DataField(fields(i), OpType.CONTINUOUS, DataType.DOUBLE)) miningSchema .addMiningFields(new MiningField(fields(i)) - .setUsageType(FieldUsageType.ACTIVE)) + .setUsageType(MiningField.UsageType.ACTIVE)) regressionTable.addNumericPredictors(new NumericPredictor(fields(i), model.weights(i))) } @@ -64,7 +66,7 @@ private[mllib] class GeneralizedLinearPMMLModelExport( dataDictionary.addDataFields(new DataField(targetField, 
OpType.CONTINUOUS, DataType.DOUBLE)) miningSchema .addMiningFields(new MiningField(targetField) - .setUsageType(FieldUsageType.TARGET)) + .setUsageType(MiningField.UsageType.TARGET)) dataDictionary.setNumberOfFields(dataDictionary.getDataFields.size) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExport.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExport.scala index 255c6140e5410..d043c9e58eebf 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExport.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExport.scala @@ -19,7 +19,9 @@ package org.apache.spark.mllib.pmml.export import scala.{Array => SArray} -import org.dmg.pmml._ +import org.dmg.pmml.{Array, CompareFunction, ComparisonMeasure, DataDictionary, DataField, DataType, + FieldName, MiningField, MiningFunction, MiningSchema, OpType, SquaredEuclidean} +import org.dmg.pmml.clustering.{Cluster, ClusteringField, ClusteringModel} import org.apache.spark.mllib.clustering.KMeansModel @@ -48,7 +50,7 @@ private[mllib] class KMeansPMMLModelExport(model: KMeansModel) extends PMMLModel .setModelName("k-means") .setMiningSchema(miningSchema) .setComparisonMeasure(comparisonMeasure) - .setFunctionName(MiningFunctionType.CLUSTERING) + .setMiningFunction(MiningFunction.CLUSTERING) .setModelClass(ClusteringModel.ModelClass.CENTER_BASED) .setNumberOfClusters(model.clusterCenters.length) @@ -57,9 +59,9 @@ private[mllib] class KMeansPMMLModelExport(model: KMeansModel) extends PMMLModel dataDictionary.addDataFields(new DataField(fields(i), OpType.CONTINUOUS, DataType.DOUBLE)) miningSchema .addMiningFields(new MiningField(fields(i)) - .setUsageType(FieldUsageType.ACTIVE)) + .setUsageType(MiningField.UsageType.ACTIVE)) clusteringModel.addClusteringFields( - new ClusteringField(fields(i)).setCompareFunction(CompareFunctionType.ABS_DIFF)) + new 
ClusteringField(fields(i)).setCompareFunction(CompareFunction.ABS_DIFF)) } dataDictionary.setNumberOfFields(dataDictionary.getDataFields.size) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala index 29bd689e1185a..84e63041dc2d6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/export/PMMLModelExportFactory.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.pmml.export -import org.dmg.pmml.RegressionNormalizationMethodType +import org.dmg.pmml.regression.RegressionModel import org.apache.spark.mllib.classification.LogisticRegressionModel import org.apache.spark.mllib.classification.SVMModel @@ -44,12 +44,12 @@ private[mllib] object PMMLModelExportFactory { new GeneralizedLinearPMMLModelExport(lasso, "lasso regression") case svm: SVMModel => new BinaryClassificationPMMLModelExport( - svm, "linear SVM", RegressionNormalizationMethodType.NONE, + svm, "linear SVM", RegressionModel.NormalizationMethod.NONE, svm.getThreshold.getOrElse(0.0)) case logistic: LogisticRegressionModel => if (logistic.numClasses == 2) { new BinaryClassificationPMMLModelExport( - logistic, "logistic regression", RegressionNormalizationMethodType.LOGIT, + logistic, "logistic regression", RegressionModel.NormalizationMethod.LOGIT, logistic.getThreshold.getOrElse(0.5)) } else { throw new IllegalArgumentException( diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala index 4c6e76e47419b..08c581cd470d9 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala +++ 
b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/BinaryClassificationPMMLModelExportSuite.scala @@ -17,8 +17,7 @@ package org.apache.spark.mllib.pmml.export -import org.dmg.pmml.RegressionModel -import org.dmg.pmml.RegressionNormalizationMethodType +import org.dmg.pmml.regression.RegressionModel import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.classification.LogisticRegressionModel @@ -51,7 +50,8 @@ class BinaryClassificationPMMLModelExportSuite extends SparkFunSuite { assert(pmmlRegressionModel.getRegressionTables.get(1).getTargetCategory === "0") assert(pmmlRegressionModel.getRegressionTables.get(1).getNumericPredictors.size === 0) // ensure logistic regression has normalization method set to LOGIT - assert(pmmlRegressionModel.getNormalizationMethod() == RegressionNormalizationMethodType.LOGIT) + assert(pmmlRegressionModel.getNormalizationMethod() === + RegressionModel.NormalizationMethod.LOGIT) } test("linear SVM PMML export") { @@ -78,7 +78,8 @@ class BinaryClassificationPMMLModelExportSuite extends SparkFunSuite { assert(pmmlRegressionModel.getRegressionTables.get(1).getTargetCategory === "0") assert(pmmlRegressionModel.getRegressionTables.get(1).getNumericPredictors.size === 0) // ensure linear SVM has normalization method set to NONE - assert(pmmlRegressionModel.getNormalizationMethod() == RegressionNormalizationMethodType.NONE) + assert(pmmlRegressionModel.getNormalizationMethod() === + RegressionModel.NormalizationMethod.NONE) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExportSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExportSuite.scala index 1d32309481787..bf1a0fd8e6071 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExportSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/GeneralizedLinearPMMLModelExportSuite.scala @@ -17,7 +17,7 @@ package 
org.apache.spark.mllib.pmml.export -import org.dmg.pmml.RegressionModel +import org.dmg.pmml.regression.RegressionModel import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala index b3f9750afa730..b61c6225e9012 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/pmml/export/KMeansPMMLModelExportSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.pmml.export -import org.dmg.pmml.ClusteringModel +import org.dmg.pmml.clustering.ClusteringModel import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.clustering.KMeansModel diff --git a/pom.xml b/pom.xml index 9253853a00a95..38eb89cd81c13 100644 --- a/pom.xml +++ b/pom.xml @@ -381,6 +381,18 @@ 14.0.1 provided + + org.jpmml + pmml-model + 1.4.8 + provided + + + org.jpmml + pmml-agent + + + org.apache.commons