From 427295e645c8dc368619926ae0313ec24540bc81 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Wed, 10 May 2017 16:24:58 +0800 Subject: [PATCH 1/4] ML 2.2 QA: New Scala APIs, docs --- .../apache/spark/ml/classification/LinearSVC.scala | 6 +++--- .../spark/ml/classification/LogisticRegression.scala | 8 ++++++-- .../scala/org/apache/spark/ml/feature/Imputer.scala | 12 +++++++----- .../org/apache/spark/ml/feature/StringIndexer.scala | 2 +- .../scala/org/apache/spark/ml/fpm/FPGrowth.scala | 2 +- .../scala/org/apache/spark/ml/stat/Correlation.scala | 4 ++-- .../spark/ml/classification/LinearSVCSuite.scala | 4 ++-- python/pyspark/ml/classification.py | 1 + 8 files changed, 23 insertions(+), 16 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala index 7507c7539d4ef..d671c6ea58035 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala @@ -51,6 +51,7 @@ private[classification] trait LinearSVCParams extends ClassifierParams with HasR * Linear SVM Classifier * * This binary classifier optimizes the Hinge Loss using the OWLQN optimizer. + * Only supports L2 regularization currently. * */ @Since("2.2.0") @@ -131,7 +132,6 @@ class LinearSVC @Since("2.2.0") ( */ @Since("2.2.0") def setThreshold(value: Double): this.type = set(threshold, value) - setDefault(threshold -> 0.0) /** * Suggested depth for treeAggregate (greater than or equal to 2). @@ -148,7 +148,7 @@ class LinearSVC @Since("2.2.0") ( @Since("2.2.0") override def copy(extra: ParamMap): LinearSVC = defaultCopy(extra) - override protected[classification] def train(dataset: Dataset[_]): LinearSVCModel = { + override protected def train(dataset: Dataset[_]): LinearSVCModel = { val w = if (!isDefined(weightCol) || $(weightCol).isEmpty) lit(1.0) else col($(weightCol)) val instances: RDD[Instance] = dataset.select(col($(labelCol)), w, col($(featuresCol))).rdd.map { @@ -264,7 +264,7 @@ object LinearSVC extends DefaultParamsReadable[LinearSVC] { /** * :: Experimental :: - * SVM Model trained by [[LinearSVC]] + * Linear SVM Model trained by [[LinearSVC]] */ @Since("2.2.0") @Experimental diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 42dc7fbebe4c3..c474af98b6a93 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -267,8 +267,12 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas } /** - * Logistic regression. Supports multinomial logistic (softmax) regression and binomial logistic - * regression. + * Logistic regression. Supports: + * - Multinomial logistic (softmax) regression. + * - Binomial logistic regression. + * + * This class supports fitting traditional logistic regression model by LBFGS/OWLQN and + * bound (box) constrained logistic regression model by LBFGSB. */ @Since("1.2.0") class LogisticRegression @Since("1.2.0") ( diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index a41bd8e689d56..683ca49c78de0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -102,7 +102,8 @@ private[feature] trait ImputerParams extends Params with HasInputCols { * computing median, DataFrameStatFunctions.approxQuantile is used with a relative error of 0.001. */ @Experimental -class Imputer @Since("2.2.0")(override val uid: String) +@Since("2.2.0") +class Imputer @Since("2.2.0")(@Since("2.2.0") override val uid: String) extends Estimator[ImputerModel] with ImputerParams with DefaultParamsWritable { @Since("2.2.0") @@ -165,8 +166,8 @@ class Imputer @Since("2.2.0")(override val uid: String) object Imputer extends DefaultParamsReadable[Imputer] { /** strategy names that Imputer currently supports. */ - private[ml] val mean = "mean" - private[ml] val median = "median" + private[feature] val mean = "mean" + private[feature] val median = "median" @Since("2.2.0") override def load(path: String): Imputer = super.load(path) @@ -180,9 +181,10 @@ object Imputer extends DefaultParamsReadable[Imputer] { * which are used to replace the missing values in the input DataFrame. */ @Experimental +@Since("2.2.0") class ImputerModel private[ml]( - override val uid: String, - val surrogateDF: DataFrame) + @Since("2.2.0") override val uid: String, + @Since("2.2.0") val surrogateDF: DataFrame) extends Model[ImputerModel] with ImputerParams with MLWritable { import ImputerModel._ diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index 99321bcc7cf98..f5073cbe55988 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -146,7 +146,7 @@ object StringIndexer extends DefaultParamsReadable[StringIndexer] { * This is a temporary fix for the case when target labels do not exist during prediction. */ @Since("1.4.0") -class StringIndexerModel ( +class StringIndexerModel private[ml] ( @Since("1.4.0") override val uid: String, @Since("1.5.0") val labels: Array[String]) extends Model[StringIndexerModel] with StringIndexerBase with MLWritable { diff --git a/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala index 8f00daa59f1a5..ad6045a50dee8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/fpm/FPGrowth.scala @@ -200,7 +200,7 @@ object FPGrowth extends DefaultParamsReadable[FPGrowth] { @Experimental class FPGrowthModel private[ml] ( @Since("2.2.0") override val uid: String, - @transient val freqItemsets: DataFrame) + @Since("2.2.0") @transient val freqItemsets: DataFrame) extends Model[FPGrowthModel] with FPGrowthParams with MLWritable { /** @group setParam */ diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Correlation.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Correlation.scala index e185bc8a6faaa..675fe2e45a93f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Correlation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Correlation.scala @@ -27,9 +27,9 @@ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.types.{StructField, StructType} /** - * API for correlation functions in MLlib, compatible with Dataframes and Datasets. + * API for correlation functions in MLlib, compatible with DataFrames and Datasets. * - * The functions in this package generalize the functions in [[org.apache.spark.sql.Dataset#stat]] + * The functions in this package generalize the functions in [[org.apache.spark.sql.Dataset.stat]] * to spark.ml's Vector types. */ @Since("2.2.0") diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala index 2f87afc23fe7e..90f26eb0c54e1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala @@ -106,7 +106,7 @@ class LinearSVCSuite extends SparkFunSuite with MLlibTestSparkContext with Defau assert(lsvc.getTol === 1E-6) assert(lsvc.getStandardization) assert(!lsvc.isDefined(lsvc.weightCol)) - assert(lsvc.getThreshold === 0.0) + assert(lsvc.getThreshold === 0.5) assert(lsvc.getAggregationDepth === 2) assert(lsvc.getLabelCol === "label") assert(lsvc.getFeaturesCol === "features") @@ -116,7 +116,7 @@ class LinearSVCSuite extends SparkFunSuite with MLlibTestSparkContext with Defau model.transform(smallBinaryDataset) .select("label", "prediction", "rawPrediction") .collect() - assert(model.getThreshold === 0.0) + assert(model.getThreshold === 0.5) assert(model.getFeaturesCol === "features") assert(model.getPredictionCol === "prediction") assert(model.getRawPredictionCol === "rawPrediction") diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index a9756ea4af99a..354e1dc0f9990 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -70,6 +70,7 @@ class LinearSVC(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, Ha `Linear SVM Classifier `_ This binary classifier optimizes the Hinge Loss using the OWLQN optimizer. + Only supports L2 regularization currently. >>> from pyspark.sql import Row >>> from pyspark.ml.linalg import Vectors From 2aab9bad77c694fcefb8dc63b1d5a4ed2d0a7190 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Wed, 10 May 2017 17:07:09 +0800 Subject: [PATCH 2/4] Fix typo. --- mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala | 2 +- mllib/src/main/scala/org/apache/spark/ml/stat/Correlation.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 683ca49c78de0..9102d9dbf4653 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -103,7 +103,7 @@ private[feature] trait ImputerParams extends Params with HasInputCols { */ @Experimental @Since("2.2.0") -class Imputer @Since("2.2.0")(@Since("2.2.0") override val uid: String) +class Imputer @Since("2.2.0") (@Since("2.2.0") override val uid: String) extends Estimator[ImputerModel] with ImputerParams with DefaultParamsWritable { @Since("2.2.0") diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Correlation.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Correlation.scala index 675fe2e45a93f..6e885d7c8aec5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Correlation.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Correlation.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.types.{StructField, StructType} /** * API for correlation functions in MLlib, compatible with DataFrames and Datasets. * - * The functions in this package generalize the functions in [[org.apache.spark.sql.Dataset.stat]] + * The functions in this package generalize the functions in [[org.apache.spark.sql.Dataset#stat]] * to spark.ml's Vector types. */ @Since("2.2.0") From 65f22a0926153336436e4e77e5fd5baa67d3665b Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Wed, 10 May 2017 22:45:43 +0800 Subject: [PATCH 3/4] Fix LinearSVC threshold. --- .../scala/org/apache/spark/ml/classification/LinearSVC.scala | 1 + .../src/main/scala/org/apache/spark/ml/feature/Imputer.scala | 2 +- .../org/apache/spark/ml/classification/LinearSVCSuite.scala | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala index d671c6ea58035..9900fbc9edda7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala @@ -132,6 +132,7 @@ class LinearSVC @Since("2.2.0") ( */ @Since("2.2.0") def setThreshold(value: Double): this.type = set(threshold, value) + setDefault(threshold -> 0.0) /** * Suggested depth for treeAggregate (greater than or equal to 2). diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 9102d9dbf4653..9e023b9dd469b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -182,7 +182,7 @@ object Imputer extends DefaultParamsReadable[Imputer] { */ @Experimental @Since("2.2.0") -class ImputerModel private[ml]( +class ImputerModel private[ml] ( @Since("2.2.0") override val uid: String, @Since("2.2.0") val surrogateDF: DataFrame) extends Model[ImputerModel] with ImputerParams with MLWritable { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala index 90f26eb0c54e1..2f87afc23fe7e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala @@ -106,7 +106,7 @@ class LinearSVCSuite extends SparkFunSuite with MLlibTestSparkContext with Defau assert(lsvc.getTol === 1E-6) assert(lsvc.getStandardization) assert(!lsvc.isDefined(lsvc.weightCol)) - assert(lsvc.getThreshold === 0.5) + assert(lsvc.getThreshold === 0.0) assert(lsvc.getAggregationDepth === 2) assert(lsvc.getLabelCol === "label") assert(lsvc.getFeaturesCol === "features") @@ -116,7 +116,7 @@ class LinearSVCSuite extends SparkFunSuite with MLlibTestSparkContext with Defau model.transform(smallBinaryDataset) .select("label", "prediction", "rawPrediction") .collect() - assert(model.getThreshold === 0.5) + assert(model.getThreshold === 0.0) assert(model.getFeaturesCol === "features") assert(model.getPredictionCol === "prediction") assert(model.getRawPredictionCol === "rawPrediction") From cef372bd0adbfa25ce37a7da241479b55d8c3231 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Tue, 16 May 2017 10:03:07 +0800 Subject: [PATCH 4/4] Revert the change for StringIndexerModel. --- .../main/scala/org/apache/spark/ml/feature/StringIndexer.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index f5073cbe55988..99321bcc7cf98 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -146,7 +146,7 @@ object StringIndexer extends DefaultParamsReadable[StringIndexer] { * This is a temporary fix for the case when target labels do not exist during prediction. */ @Since("1.4.0") -class StringIndexerModel private[ml] ( +class StringIndexerModel ( @Since("1.4.0") override val uid: String, @Since("1.5.0") val labels: Array[String]) extends Model[StringIndexerModel] with StringIndexerBase with MLWritable {