From 2adebe8de3881509e510fc518c562d1141ccd0ef Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Wed, 10 Aug 2016 13:40:18 +0800 Subject: [PATCH 01/21] add a chiSquare Selector based on False Positive Rate (FPR) test --- .../spark/mllib/feature/ChiSqSelector.scala | 29 +++++++++++++++++-- .../mllib/feature/ChiSqSelectorSuite.scala | 18 ++++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index c8c2823bbaf04..f3316eeee8fec 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -173,8 +173,8 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { * Creates a ChiSquared feature selector. * @param numTopFeatures number of features that selector will select * (ordered by statistic value descending) - * Note that if the number of features is < numTopFeatures, then this will - * select all features. + * Note that if the number of features is less than numTopFeatures, + * then this will select all features. */ @Since("1.3.0") class ChiSqSelector @Since("1.3.0") ( @@ -197,3 +197,28 @@ class ChiSqSelector @Since("1.3.0") ( new ChiSqSelectorModel(indices) } } + +/** + * Creates a ChiSquared feature selector by False Positive Rate (FPR) test. + * @param alpha the highest p-value for features to be kept + */ +@Since("2.1.0") +class ChiSqSelectorByFpr @Since("2.1.0") ( + @Since("2.1.0") val alpha: Double) extends Serializable { + + /** + * Returns a ChiSquared feature selector by FPR. + * + * @param data an `RDD[LabeledPoint]` containing the labeled dataset with categorical features. + * Real-valued features will be treated as categorical for each distinct value. + * Apply feature discretizer before using this function. 
+ */ + @Since("2.1.0") + def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { + val indices = Statistics.chiSqTest(data) + .zipWithIndex.filter { case (res, _) => res.pValue < alpha } + .map { case (_, indices) => indices } + .sorted + new ChiSqSelectorModel(indices) + } +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala index 734800a9afad6..6b2209c8a7c15 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala @@ -65,6 +65,24 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { assert(filteredData == preFilteredData) } + test("ChiSqSelectorByFpr transform test (sparse & dense vector)") { + val labeledDiscreteData = sc.parallelize( + Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))), + LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2) + val preFilteredData = + Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(4.0))), + LabeledPoint(1.0, Vectors.dense(Array(4.0))), + LabeledPoint(2.0, Vectors.dense(Array(9.0)))) + val model = new ChiSqSelectorByFpr(0.1).fit(labeledDiscreteData) + val filteredData = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model.transform(lp.features)) + }.collect().toSet + assert(filteredData == preFilteredData) + } + test("model load / save") { val model = ChiSqSelectorSuite.createModel() val tempDir = Utils.createTempDir() From 7623563884355a04867ce5271baa286f65180e62 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Tue, 16 Aug 2016 21:36:11 +0800 Subject: [PATCH 02/21] Configure the ChiSqSelector to reuse ChiSqTestResult by numTopFeatures, Percentile, and Fpr 
selector --- .../mllib/JavaChiSqSelectorExample.java | 3 +- .../examples/mllib/ChiSqSelectorExample.scala | 3 +- .../spark/ml/feature/ChiSqSelector.scala | 60 ++++++++++- .../mllib/api/python/PythonMLLibAPI.scala | 4 +- .../spark/mllib/feature/ChiSqSelector.scala | 102 +++++++++++------- .../mllib/feature/ChiSqSelectorSuite.scala | 6 +- 6 files changed, 126 insertions(+), 52 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java index ad44acb4cd6e3..f0619b7bc5685 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java @@ -56,7 +56,8 @@ public LabeledPoint call(LabeledPoint lp) { ); // Create ChiSqSelector that will select top 50 of 692 features - ChiSqSelector selector = new ChiSqSelector(50); + ChiSqSelector selector = new ChiSqSelector(); + selector.setNumTopFeatures(50); // Create ChiSqSelector model (selecting features) final ChiSqSelectorModel transformer = selector.fit(discretizedData.rdd()); // Filter the top 50 features from each feature vector diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala index 5e400b7d715b4..9fb520ce56acc 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala @@ -43,7 +43,8 @@ object ChiSqSelectorExample { LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map { x => (x / 16).floor })) } // Create ChiSqSelector that will select top 50 of 692 features - val selector = new ChiSqSelector(50) + val selector = new ChiSqSelector() + selector.setNumTopFeatures(50) // Create ChiSqSelector model (selecting 
features) val transformer = selector.fit(discretizedData) // Filter the top 50 features from each feature vector diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 1482eb3d1f7a6..439514bdb4a4c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -27,6 +27,7 @@ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature +import org.apache.spark.mllib.feature.SelectorType import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} import org.apache.spark.rdd.RDD @@ -51,11 +52,29 @@ private[feature] trait ChiSqSelectorParams extends Params " number of features is < numTopFeatures, then this will select all features.", ParamValidators.gtEq(1)) setDefault(numTopFeatures -> 50) + final val percentile = new IntParam(this, "percentile", + "Percentile of features that selector will select, ordered by statistics value descending.", + ParamValidators.gtEq(0)) + setDefault(percentile -> 10) + + final val alpha = new DoubleParam(this, "alpha", + "The highest p-value for features to be kept.", + ParamValidators.gtEq(0)) + setDefault(alpha -> 0.05) + + final val selectorType = SelectorType.KBest /** @group getParam */ def getNumTopFeatures: Int = $(numTopFeatures) + + def getPercentile: Int = $(percentile) + + def getAlpha: Double = $(alpha) + + def getSelectorType: SelectorType.Value = selectorType } + /** * Chi-Squared feature selection, which selects categorical features to use for predicting a * categorical label. 
@@ -66,10 +85,26 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str @Since("1.6.0") def this() = this(Identifiable.randomUID("chiSqSelector")) - + val chiSqSelector = new feature.ChiSqSelector() /** @group setParam */ @Since("1.6.0") - def setNumTopFeatures(value: Int): this.type = set(numTopFeatures, value) + def setNumTopFeatures(value: Int): this.type = { + chiSqSelector.setNumTopFeatures(value) + chiSqSelector.setSelectorType(SelectorType.KBest) + set(numTopFeatures, value) + } + + def setPercentile(value: Int): this.type = { + chiSqSelector.setPercentile(value) + chiSqSelector.setSelectorType(SelectorType.Percentile) + set(percentile, value) + } + + def setAlpha(value: Double): this.type = { + chiSqSelector.setAlpha(value) + chiSqSelector.setSelectorType(SelectorType.Fpr) + set(alpha, value) + } /** @group setParam */ @Since("1.6.0") @@ -89,10 +124,25 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str val input: RDD[OldLabeledPoint] = dataset.select(col($(labelCol)).cast(DoubleType), col($(featuresCol))).rdd.map { case Row(label: Double, features: Vector) => - OldLabeledPoint(label, OldVectors.fromML(features)) + OldLabeledPoint(label, OldVectors.fromML(features)) } - val chiSqSelector = new feature.ChiSqSelector($(numTopFeatures)).fit(input) - copyValues(new ChiSqSelectorModel(uid, chiSqSelector).setParent(this)) + val model = chiSqSelector.fit(input) + copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) + } + + def selectKBest(value: Int): ChiSqSelectorModel = { + val model = chiSqSelector.selectKBest(value) + copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) + } + + def selectPercentile(value: Int): ChiSqSelectorModel = { + val model = chiSqSelector.selectPercentile(value) + copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) + } + + def selectFpr(value: Double): ChiSqSelectorModel = { + val model = chiSqSelector.selectFpr(value) + copyValues(new 
ChiSqSelectorModel(uid, model).setParent(this)) } @Since("1.6.0") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index a80cca70f4b28..bdcfe70651e3d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -634,8 +634,8 @@ private[python] class PythonMLLibAPI extends Serializable { * Extra care needs to be taken in the Python code to ensure it gets freed on * exit; see the Py4J documentation. */ - def fitChiSqSelector(numTopFeatures: Int, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { - new ChiSqSelector(numTopFeatures).fit(data.rdd) + def fitChiSqSelector(data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { + new ChiSqSelector().fit(data.rdd) } /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index f3316eeee8fec..9bc75c65165fd 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -27,22 +27,26 @@ import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics +import org.apache.spark.mllib.stat.test.ChiSqTestResult import org.apache.spark.mllib.util.{Loader, Saveable} import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.sql.{Row, SparkSession} +object SelectorType extends Enumeration { + type SelectorType = Value + val KBest, Percentile, Fpr = Value +} + /** * Chi Squared selector model. * - * @param selectedFeatures list of indices to select (filter). 
Must be ordered asc + * @param selectedFeatures list of indices to select (filter). */ @Since("1.3.0") class ChiSqSelectorModel @Since("1.3.0") ( @Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer with Saveable { - require(isSorted(selectedFeatures), "Array has to be sorted asc") - protected def isSorted(array: Array[Int]): Boolean = { var i = 1 val len = array.length @@ -69,21 +73,23 @@ class ChiSqSelectorModel @Since("1.3.0") ( * Preserves the order of filtered features the same as their indices are stored. * Might be moved to Vector as .slice * @param features vector - * @param filterIndices indices of features to filter, must be ordered asc + * @param filterIndices indices of features to filter */ private def compress(features: Vector, filterIndices: Array[Int]): Vector = { + val orderedIndices = filterIndices.sorted + require(isSorted(orderedIndices), "Array has to be sorted asc") features match { case SparseVector(size, indices, values) => - val newSize = filterIndices.length + val newSize = orderedIndices.length val newValues = new ArrayBuilder.ofDouble val newIndices = new ArrayBuilder.ofInt var i = 0 var j = 0 var indicesIdx = 0 var filterIndicesIdx = 0 - while (i < indices.length && j < filterIndices.length) { + while (i < indices.length && j < orderedIndices.length) { indicesIdx = indices(i) - filterIndicesIdx = filterIndices(j) + filterIndicesIdx = orderedIndices(j) if (indicesIdx == filterIndicesIdx) { newIndices += j newValues += values(i) @@ -101,7 +107,7 @@ class ChiSqSelectorModel @Since("1.3.0") ( Vectors.sparse(newSize, newIndices.result(), newValues.result()) case DenseVector(values) => val values = features.toArray - Vectors.dense(filterIndices.map(i => values(i))) + Vectors.dense(orderedIndices.map(i => values(i))) case other => throw new UnsupportedOperationException( s"Only sparse and dense vectors are supported but got ${other.getClass}.") @@ -171,14 +177,34 @@ object ChiSqSelectorModel extends 
Loader[ChiSqSelectorModel] { /** * Creates a ChiSquared feature selector. - * @param numTopFeatures number of features that selector will select - * (ordered by statistic value descending) - * Note that if the number of features is less than numTopFeatures, - * then this will select all features. */ @Since("1.3.0") -class ChiSqSelector @Since("1.3.0") ( - @Since("1.3.0") val numTopFeatures: Int) extends Serializable { +class ChiSqSelector @Since("1.3.0") () extends Serializable { + var numTopFeatures: Int = 1 + var percentile: Int = 10 + var alpha: Double = 0.05 + var selectorType = SelectorType.KBest + var chiSqTestResult: Array[ChiSqTestResult] = new Array[ChiSqTestResult](0) + + def setNumTopFeatures(value: Int): this.type = { + numTopFeatures = value + selectorType = SelectorType.KBest + this + } + def setPercentile(value: Int): this.type = { + percentile = value + selectorType = SelectorType.Percentile + this + } + def setAlpha(value: Double): this.type = { + alpha = value + selectorType = SelectorType.Fpr + this + } + def setSelectorType(value: SelectorType.Value): this.type = { + selectorType = value + this + } /** * Returns a ChiSquared feature selector. 
@@ -189,36 +215,32 @@ class ChiSqSelector @Since("1.3.0") ( */ @Since("1.3.0") def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { - val indices = Statistics.chiSqTest(data) - .zipWithIndex.sortBy { case (res, _) => -res.statistic } - .take(numTopFeatures) - .map { case (_, indices) => indices } - .sorted + chiSqTestResult = Statistics.chiSqTest(data) + selectorType match { + case SelectorType.KBest => selectKBest(numTopFeatures) + case SelectorType.Percentile => selectPercentile(percentile) + case SelectorType.Fpr => selectFpr(alpha) + } + } + + def selectKBest(value: Int): ChiSqSelectorModel = { + val indices = chiSqTestResult.zipWithIndex.sortBy { case (res, _) => -res.statistic } + .take(numTopFeatures) + .map { case (_, indices) => indices } new ChiSqSelectorModel(indices) } -} -/** - * Creates a ChiSquared feature selector by False Positive Rate (FPR) test. - * @param alpha the highest p-value for features to be kept - */ -@Since("2.1.0") -class ChiSqSelectorByFpr @Since("2.1.0") ( - @Since("2.1.0") val alpha: Double) extends Serializable { + def selectPercentile(value: Int): ChiSqSelectorModel = { + val indices = chiSqTestResult.zipWithIndex.sortBy { case (res, _) => -res.statistic } + .take((chiSqTestResult.length * percentile / 100).toInt) + .map { case (_, indices) => indices } + new ChiSqSelectorModel(indices) + } - /** - * Returns a ChiSquared feature selector by FPR. - * - * @param data an `RDD[LabeledPoint]` containing the labeled dataset with categorical features. - * Real-valued features will be treated as categorical for each distinct value. - * Apply feature discretizer before using this function. 
- */ - @Since("2.1.0") - def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { - val indices = Statistics.chiSqTest(data) - .zipWithIndex.filter { case (res, _) => res.pValue < alpha } - .map { case (_, indices) => indices } - .sorted + def selectFpr(value: Double): ChiSqSelectorModel = { + val indices = chiSqTestResult.zipWithIndex.filter{ case (res, _) => res.pValue < alpha } + .map { case (_, indices) => indices } new ChiSqSelectorModel(indices) } } + diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala index 6b2209c8a7c15..d61888df9c0dc 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala @@ -58,14 +58,14 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(1.0, Vectors.dense(Array(6.0))), LabeledPoint(1.0, Vectors.dense(Array(8.0))), LabeledPoint(2.0, Vectors.dense(Array(5.0)))) - val model = new ChiSqSelector(1).fit(labeledDiscreteData) + val model = new ChiSqSelector().fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) } - test("ChiSqSelectorByFpr transform test (sparse & dense vector)") { + test("ChiSqSelector by FPR transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))), @@ -76,7 +76,7 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(2.0, Vectors.dense(Array(9.0)))) - val model = new ChiSqSelectorByFpr(0.1).fit(labeledDiscreteData) + val model = 
new ChiSqSelector().setAlpha(0.1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet From 3d6aecb8441503c9c3d62a2d8a3d48824b9d6637 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Wed, 17 Aug 2016 10:34:59 +0800 Subject: [PATCH 03/21] Config the ChiSqSelector to reuse the ChiSqTestResult by KBest, Percentile and FPR selector --- .../mllib/JavaChiSqSelectorExample.java | 3 +- .../examples/mllib/ChiSqSelectorExample.scala | 3 +- .../spark/ml/feature/ChiSqSelector.scala | 12 +++--- .../mllib/api/python/PythonMLLibAPI.scala | 4 +- .../spark/mllib/feature/ChiSqSelector.scala | 38 ++++++++++--------- .../mllib/feature/ChiSqSelectorSuite.scala | 2 +- 6 files changed, 32 insertions(+), 30 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java index f0619b7bc5685..ad44acb4cd6e3 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java @@ -56,8 +56,7 @@ public LabeledPoint call(LabeledPoint lp) { ); // Create ChiSqSelector that will select top 50 of 692 features - ChiSqSelector selector = new ChiSqSelector(); - selector.setNumTopFeatures(50); + ChiSqSelector selector = new ChiSqSelector(50); // Create ChiSqSelector model (selecting features) final ChiSqSelectorModel transformer = selector.fit(discretizedData.rdd()); // Filter the top 50 features from each feature vector diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala index 9fb520ce56acc..5e400b7d715b4 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala +++ 
b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala @@ -43,8 +43,7 @@ object ChiSqSelectorExample { LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map { x => (x / 16).floor })) } // Create ChiSqSelector that will select top 50 of 692 features - val selector = new ChiSqSelector() - selector.setNumTopFeatures(50) + val selector = new ChiSqSelector(50) // Create ChiSqSelector model (selecting features) val transformer = selector.fit(discretizedData) // Filter the top 50 features from each feature vector diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 439514bdb4a4c..a44ac2fe73aea 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -27,7 +27,7 @@ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature -import org.apache.spark.mllib.feature.SelectorType +import org.apache.spark.mllib.feature.ChiSqSelectorType import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} import org.apache.spark.rdd.RDD @@ -62,7 +62,7 @@ private[feature] trait ChiSqSelectorParams extends Params ParamValidators.gtEq(0)) setDefault(alpha -> 0.05) - final val selectorType = SelectorType.KBest + final val selectorType = ChiSqSelectorType.KBest /** @group getParam */ def getNumTopFeatures: Int = $(numTopFeatures) @@ -71,7 +71,7 @@ private[feature] trait ChiSqSelectorParams extends Params def getAlpha: Double = $(alpha) - def getSelectorType: SelectorType.Value = selectorType + def getChiSqSelectorType: ChiSqSelectorType.Value = selectorType } @@ -90,19 +90,19 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str @Since("1.6.0") def 
setNumTopFeatures(value: Int): this.type = { chiSqSelector.setNumTopFeatures(value) - chiSqSelector.setSelectorType(SelectorType.KBest) + chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.KBest) set(numTopFeatures, value) } def setPercentile(value: Int): this.type = { chiSqSelector.setPercentile(value) - chiSqSelector.setSelectorType(SelectorType.Percentile) + chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.Percentile) set(percentile, value) } def setAlpha(value: Double): this.type = { chiSqSelector.setAlpha(value) - chiSqSelector.setSelectorType(SelectorType.Fpr) + chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.Fpr) set(alpha, value) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index bdcfe70651e3d..a80cca70f4b28 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -634,8 +634,8 @@ private[python] class PythonMLLibAPI extends Serializable { * Extra care needs to be taken in the Python code to ensure it gets freed on * exit; see the Py4J documentation. 
*/ - def fitChiSqSelector(data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { - new ChiSqSelector().fit(data.rdd) + def fitChiSqSelector(numTopFeatures: Int, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { + new ChiSqSelector(numTopFeatures).fit(data.rdd) } /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 9bc75c65165fd..e2345b85a279e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -33,7 +33,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.sql.{Row, SparkSession} -object SelectorType extends Enumeration { +object ChiSqSelectorType extends Enumeration { type SelectorType = Value val KBest, Percentile, Fpr = Value } @@ -77,7 +77,6 @@ class ChiSqSelectorModel @Since("1.3.0") ( */ private def compress(features: Vector, filterIndices: Array[Int]): Vector = { val orderedIndices = filterIndices.sorted - require(isSorted(orderedIndices), "Array has to be sorted asc") features match { case SparseVector(size, indices, values) => val newSize = orderedIndices.length @@ -178,30 +177,34 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { /** * Creates a ChiSquared feature selector. 
*/ -@Since("1.3.0") -class ChiSqSelector @Since("1.3.0") () extends Serializable { - var numTopFeatures: Int = 1 - var percentile: Int = 10 - var alpha: Double = 0.05 - var selectorType = SelectorType.KBest - var chiSqTestResult: Array[ChiSqTestResult] = new Array[ChiSqTestResult](0) - +@Since("2.1.0") +class ChiSqSelector @Since("2.1.0") () extends Serializable { + private var numTopFeatures: Int = 1 + private var percentile: Int = 10 + private var alpha: Double = 0.05 + private var selectorType = ChiSqSelectorType.KBest + private var chiSqTestResult: Array[ChiSqTestResult] = _ + + def this(numTopFeatures: Int) { + this() + this.numTopFeatures = numTopFeatures + } def setNumTopFeatures(value: Int): this.type = { numTopFeatures = value - selectorType = SelectorType.KBest + selectorType = ChiSqSelectorType.KBest this } def setPercentile(value: Int): this.type = { percentile = value - selectorType = SelectorType.Percentile + selectorType = ChiSqSelectorType.Percentile this } def setAlpha(value: Double): this.type = { alpha = value - selectorType = SelectorType.Fpr + selectorType = ChiSqSelectorType.Fpr this } - def setSelectorType(value: SelectorType.Value): this.type = { + def setChiSqSelectorType(value: ChiSqSelectorType.Value): this.type = { selectorType = value this } @@ -217,9 +220,10 @@ class ChiSqSelector @Since("1.3.0") () extends Serializable { def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { chiSqTestResult = Statistics.chiSqTest(data) selectorType match { - case SelectorType.KBest => selectKBest(numTopFeatures) - case SelectorType.Percentile => selectPercentile(percentile) - case SelectorType.Fpr => selectFpr(alpha) + case ChiSqSelectorType.KBest => selectKBest(numTopFeatures) + case ChiSqSelectorType.Percentile => selectPercentile(percentile) + case ChiSqSelectorType.Fpr => selectFpr(alpha) + case _ => throw new Exception } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala index d61888df9c0dc..e181a544f7159 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala @@ -58,7 +58,7 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(1.0, Vectors.dense(Array(6.0))), LabeledPoint(1.0, Vectors.dense(Array(8.0))), LabeledPoint(2.0, Vectors.dense(Array(5.0)))) - val model = new ChiSqSelector().fit(labeledDiscreteData) + val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet From 5305709c9d4029186318b99fa9c7c483897aa653 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Wed, 17 Aug 2016 17:59:16 +0800 Subject: [PATCH 04/21] add Since annotation --- .../spark/ml/feature/ChiSqSelector.scala | 63 ++++++++++++------- .../spark/mllib/feature/ChiSqSelector.scala | 34 ++++++---- .../spark/ml/feature/ChiSqSelectorSuite.scala | 11 +++- 3 files changed, 74 insertions(+), 34 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index a44ac2fe73aea..d6b847a7770b0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -27,7 +27,6 @@ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature -import org.apache.spark.mllib.feature.ChiSqSelectorType import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} import org.apache.spark.rdd.RDD @@ -52,28 +51,33 @@ private[feature] trait ChiSqSelectorParams extends Params " number of 
features is < numTopFeatures, then this will select all features.", ParamValidators.gtEq(1)) setDefault(numTopFeatures -> 50) - final val percentile = new IntParam(this, "percentile", + + /** @group getParam */ + def getNumTopFeatures: Int = $(numTopFeatures) + + final val percentile = new DoubleParam(this, "percentile", "Percentile of features that selector will select, ordered by statistics value descending.", ParamValidators.gtEq(0)) setDefault(percentile -> 10) + /** @group getParam */ + def getPercentile: Double = $(percentile) + final val alpha = new DoubleParam(this, "alpha", "The highest p-value for features to be kept.", ParamValidators.gtEq(0)) setDefault(alpha -> 0.05) - final val selectorType = ChiSqSelectorType.KBest - /** @group getParam */ - def getNumTopFeatures: Int = $(numTopFeatures) - - def getPercentile: Int = $(percentile) - def getAlpha: Double = $(alpha) - def getChiSqSelectorType: ChiSqSelectorType.Value = selectorType -} + final val selectorType = new Param[String](this, "selectorType", + "ChiSqSelector Type: KBest, Percentile, Fpr") + setDefault(selectorType -> "KBest") + /** @group getParam */ + def getChiSqSelectorType: String = $(selectorType) +} /** * Chi-Squared feature selection, which selects categorical features to use for predicting a @@ -85,24 +89,26 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str @Since("1.6.0") def this() = this(Identifiable.randomUID("chiSqSelector")) - val chiSqSelector = new feature.ChiSqSelector() + + @Since("2.1.0") + var chiSqSelector: feature.ChiSqSelector = null + /** @group setParam */ - @Since("1.6.0") + @Since("2.1.0") def setNumTopFeatures(value: Int): this.type = { - chiSqSelector.setNumTopFeatures(value) - chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.KBest) + set(selectorType, "KBest") set(numTopFeatures, value) } - def setPercentile(value: Int): this.type = { - chiSqSelector.setPercentile(value) - 
chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.Percentile) + @Since("2.1.0") + def setPercentile(value: Double): this.type = { + set(selectorType, "Percentile") set(percentile, value) } + @Since("2.1.0") def setAlpha(value: Double): this.type = { - chiSqSelector.setAlpha(value) - chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.Fpr) + set(selectorType, "Fpr") set(alpha, value) } @@ -124,23 +130,38 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str val input: RDD[OldLabeledPoint] = dataset.select(col($(labelCol)).cast(DoubleType), col($(featuresCol))).rdd.map { case Row(label: Double, features: Vector) => - OldLabeledPoint(label, OldVectors.fromML(features)) + OldLabeledPoint(label, OldVectors.fromML(features)) } + $(selectorType) match { + case "KBest" => + chiSqSelector = new feature.ChiSqSelector().setNumTopFeatures($(numTopFeatures)) + case "Percentile" => + chiSqSelector = new feature.ChiSqSelector().setPercentile($(percentile)) + case "Fpr" => + chiSqSelector = new feature.ChiSqSelector().setAlpha($(alpha)) + case _ => throw new Exception("Unknown ChiSqSelector Type.") + } val model = chiSqSelector.fit(input) copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) } + @Since("2.1.0") def selectKBest(value: Int): ChiSqSelectorModel = { + require(chiSqSelector != null, "ChiSqSelector has not been created.") val model = chiSqSelector.selectKBest(value) copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) } - def selectPercentile(value: Int): ChiSqSelectorModel = { + @Since("2.1.0") + def selectPercentile(value: Double): ChiSqSelectorModel = { + require(chiSqSelector != null, "ChiSqSelector has not been created.") val model = chiSqSelector.selectPercentile(value) copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) } + @Since("2.1.0") def selectFpr(value: Double): ChiSqSelectorModel = { + require(chiSqSelector != null, "ChiSqSelector has not been created.") val model = 
chiSqSelector.selectFpr(value) copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index e2345b85a279e..1c3b49a04b843 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -33,6 +33,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.sql.{Row, SparkSession} +@Since("2.1.0") object ChiSqSelectorType extends Enumeration { type SelectorType = Value val KBest, Percentile, Fpr = Value @@ -179,31 +180,40 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { */ @Since("2.1.0") class ChiSqSelector @Since("2.1.0") () extends Serializable { - private var numTopFeatures: Int = 1 - private var percentile: Int = 10 + private var numTopFeatures: Int = 50 + private var percentile: Double = 10.0 private var alpha: Double = 0.05 private var selectorType = ChiSqSelectorType.KBest private var chiSqTestResult: Array[ChiSqTestResult] = _ + @Since("1.3.0") def this(numTopFeatures: Int) { this() this.numTopFeatures = numTopFeatures } + + @Since("2.1.0") def setNumTopFeatures(value: Int): this.type = { numTopFeatures = value selectorType = ChiSqSelectorType.KBest this } - def setPercentile(value: Int): this.type = { + + @Since("2.1.0") + def setPercentile(value: Double): this.type = { percentile = value selectorType = ChiSqSelectorType.Percentile this } + + @Since("2.1.0") def setAlpha(value: Double): this.type = { alpha = value selectorType = ChiSqSelectorType.Fpr this } + + @Since("2.1.0") def setChiSqSelectorType(value: ChiSqSelectorType.Value): this.type = { selectorType = value this @@ -219,14 +229,15 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { @Since("1.3.0") def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { 
chiSqTestResult = Statistics.chiSqTest(data) - selectorType match { - case ChiSqSelectorType.KBest => selectKBest(numTopFeatures) - case ChiSqSelectorType.Percentile => selectPercentile(percentile) - case ChiSqSelectorType.Fpr => selectFpr(alpha) - case _ => throw new Exception - } + selectorType match { + case ChiSqSelectorType.KBest => selectKBest(numTopFeatures) + case ChiSqSelectorType.Percentile => selectPercentile(percentile) + case ChiSqSelectorType.Fpr => selectFpr(alpha) + case _ => throw new Exception("Unknown ChiSqSelector Type") + } } + @Since("2.1.0") def selectKBest(value: Int): ChiSqSelectorModel = { val indices = chiSqTestResult.zipWithIndex.sortBy { case (res, _) => -res.statistic } .take(numTopFeatures) @@ -234,17 +245,18 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { new ChiSqSelectorModel(indices) } - def selectPercentile(value: Int): ChiSqSelectorModel = { + @Since("2.1.0") + def selectPercentile(value: Double): ChiSqSelectorModel = { val indices = chiSqTestResult.zipWithIndex.sortBy { case (res, _) => -res.statistic } .take((chiSqTestResult.length * percentile / 100).toInt) .map { case (_, indices) => indices } new ChiSqSelectorModel(indices) } + @Since("2.1.0") def selectFpr(value: Double): ChiSqSelectorModel = { val indices = chiSqTestResult.zipWithIndex.filter{ case (res, _) => res.pValue < alpha } .map { case (_, indices) => indices } new ChiSqSelectorModel(indices) } } - diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala index 3558290b23ae0..a29ff83ae0cce 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala @@ -49,16 +49,23 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext .map(x => (x._1.label, x._1.features, x._2)) .toDF("label", "data", "preFilteredData") - val model = 
new ChiSqSelector() + val selector = new ChiSqSelector() .setNumTopFeatures(1) .setFeaturesCol("data") .setLabelCol("label") .setOutputCol("filtered") - model.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { + selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } + + selector.selectPercentile(34).transform(df) + .select("filtered", "preFilteredData").collect().foreach { + case Row(vec1: Vector, vec2: Vector) => + assert(vec1 ~== vec2 absTol 1e-1) + } + } test("ChiSqSelector read/write") { From 1e8d83a58b919256435d7f183a4cfb2154dfd2ee Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Mon, 22 Aug 2016 11:09:33 +0800 Subject: [PATCH 05/21] Not reuse the ChiSqTestResult to be consistent with other methods --- .../spark/ml/feature/ChiSqSelector.scala | 44 +++++-------------- .../spark/mllib/feature/ChiSqSelector.scala | 44 ++++++------------- .../spark/ml/feature/ChiSqSelectorSuite.scala | 2 +- 3 files changed, 25 insertions(+), 65 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index d6b847a7770b0..884da6de85450 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -27,6 +27,7 @@ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature +import org.apache.spark.mllib.feature.ChiSqSelectorType import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} import org.apache.spark.rdd.RDD @@ -73,7 +74,7 @@ private[feature] trait ChiSqSelectorParams extends Params final val selectorType = new Param[String](this, "selectorType", "ChiSqSelector Type: 
KBest, Percentile, Fpr") - setDefault(selectorType -> "KBest") + setDefault(selectorType -> ChiSqSelectorType.KBest.toString) /** @group getParam */ def getChiSqSelectorType: String = $(selectorType) @@ -90,25 +91,22 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str @Since("1.6.0") def this() = this(Identifiable.randomUID("chiSqSelector")) - @Since("2.1.0") - var chiSqSelector: feature.ChiSqSelector = null - /** @group setParam */ @Since("2.1.0") def setNumTopFeatures(value: Int): this.type = { - set(selectorType, "KBest") + set(selectorType, ChiSqSelectorType.KBest.toString) set(numTopFeatures, value) } @Since("2.1.0") def setPercentile(value: Double): this.type = { - set(selectorType, "Percentile") + set(selectorType, ChiSqSelectorType.Percentile.toString) set(percentile, value) } @Since("2.1.0") def setAlpha(value: Double): this.type = { - set(selectorType, "Fpr") + set(selectorType, ChiSqSelectorType.Fpr.toString) set(alpha, value) } @@ -132,37 +130,15 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str case Row(label: Double, features: Vector) => OldLabeledPoint(label, OldVectors.fromML(features)) } - $(selectorType) match { + var model = $(selectorType) match { case "KBest" => - chiSqSelector = new feature.ChiSqSelector().setNumTopFeatures($(numTopFeatures)) + new feature.ChiSqSelector().setNumTopFeatures($(numTopFeatures)).fit(input) case "Percentile" => - chiSqSelector = new feature.ChiSqSelector().setPercentile($(percentile)) + new feature.ChiSqSelector().setPercentile($(percentile)).fit(input) case "Fpr" => - chiSqSelector = new feature.ChiSqSelector().setAlpha($(alpha)) - case _ => throw new Exception("Unknown ChiSqSelector Type.") + new feature.ChiSqSelector().setAlpha($(alpha)).fit(input) + case _ => throw new IllegalStateException("Unknown ChiSqSelector Type.") } - val model = chiSqSelector.fit(input) - copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) - } - - 
@Since("2.1.0") - def selectKBest(value: Int): ChiSqSelectorModel = { - require(chiSqSelector != null, "ChiSqSelector has not been created.") - val model = chiSqSelector.selectKBest(value) - copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) - } - - @Since("2.1.0") - def selectPercentile(value: Double): ChiSqSelectorModel = { - require(chiSqSelector != null, "ChiSqSelector has not been created.") - val model = chiSqSelector.selectPercentile(value) - copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) - } - - @Since("2.1.0") - def selectFpr(value: Double): ChiSqSelectorModel = { - require(chiSqSelector != null, "ChiSqSelector has not been created.") - val model = chiSqSelector.selectFpr(value) copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 1c3b49a04b843..6c0db2cb8ccac 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -27,7 +27,6 @@ import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics -import org.apache.spark.mllib.stat.test.ChiSqTestResult import org.apache.spark.mllib.util.{Loader, Saveable} import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext @@ -184,7 +183,6 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { private var percentile: Double = 10.0 private var alpha: Double = 0.05 private var selectorType = ChiSqSelectorType.KBest - private var chiSqTestResult: Array[ChiSqTestResult] = _ @Since("1.3.0") def this(numTopFeatures: Int) { @@ -201,6 +199,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { @Since("2.1.0") def 
setPercentile(value: Double): this.type = { + require(value <= 100 && value >= 0, "Percentile should be larger than 0 and less than 100") percentile = value selectorType = ChiSqSelectorType.Percentile this @@ -228,35 +227,20 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { */ @Since("1.3.0") def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { - chiSqTestResult = Statistics.chiSqTest(data) - selectorType match { - case ChiSqSelectorType.KBest => selectKBest(numTopFeatures) - case ChiSqSelectorType.Percentile => selectPercentile(percentile) - case ChiSqSelectorType.Fpr => selectFpr(alpha) - case _ => throw new Exception("Unknown ChiSqSelector Type") + var indices = selectorType match { + case ChiSqSelectorType.KBest => Statistics.chiSqTest(data) + .zipWithIndex.sortBy { case (res, _) => -res.statistic } + .take(numTopFeatures) + .map { case (_, indices) => indices } + case ChiSqSelectorType.Percentile => Statistics.chiSqTest(data) + .zipWithIndex.sortBy { case (res, _) => -res.statistic } + .take((data.count() * percentile / 100).toInt) + .map { case (_, indices) => indices } + case ChiSqSelectorType.Fpr => Statistics.chiSqTest(data) + .zipWithIndex.filter{ case (res, _) => res.pValue < alpha } + .map { case (_, indices) => indices } + case _ => throw new IllegalStateException("Unknown ChiSqSelector Type") } - } - - @Since("2.1.0") - def selectKBest(value: Int): ChiSqSelectorModel = { - val indices = chiSqTestResult.zipWithIndex.sortBy { case (res, _) => -res.statistic } - .take(numTopFeatures) - .map { case (_, indices) => indices } - new ChiSqSelectorModel(indices) - } - - @Since("2.1.0") - def selectPercentile(value: Double): ChiSqSelectorModel = { - val indices = chiSqTestResult.zipWithIndex.sortBy { case (res, _) => -res.statistic } - .take((chiSqTestResult.length * percentile / 100).toInt) - .map { case (_, indices) => indices } - new ChiSqSelectorModel(indices) - } - - @Since("2.1.0") - def selectFpr(value: Double): ChiSqSelectorModel = { 
- val indices = chiSqTestResult.zipWithIndex.filter{ case (res, _) => res.pValue < alpha } - .map { case (_, indices) => indices } new ChiSqSelectorModel(indices) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala index a29ff83ae0cce..50d175a18753c 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala @@ -60,7 +60,7 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext assert(vec1 ~== vec2 absTol 1e-1) } - selector.selectPercentile(34).transform(df) + selector.setPercentile(34).fit(df).transform(df) .select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) From 85a17dd8a3e0a8d0d5a041d14a4472b5c202abcc Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Mon, 22 Aug 2016 19:20:38 +0800 Subject: [PATCH 06/21] fix Percentile bugs, optimize the code --- .../spark/ml/feature/ChiSqSelector.scala | 18 +++++++++---- .../spark/mllib/feature/ChiSqSelector.scala | 25 ++++++++++--------- .../spark/ml/feature/ChiSqSelectorSuite.scala | 2 +- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 884da6de85450..0d1946136832f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -72,6 +72,13 @@ private[feature] trait ChiSqSelectorParams extends Params /** @group getParam */ def getAlpha: Double = $(alpha) + /** + * The ChiSqSelector supports KBest, Percentile, Fpr selection, + * which is the same as ChiSqSelectorType defined in MLLIB. 
+ * when call setNumTopFeatures, the selectorType is set to KBest + * when call setPercentile, the selectorType is set to Percentile + * when call setFpr, the selectorType is set to Fpr + */ final val selectorType = new Param[String](this, "selectorType", "ChiSqSelector Type: KBest, Percentile, Fpr") setDefault(selectorType -> ChiSqSelectorType.KBest.toString) @@ -130,14 +137,15 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str case Row(label: Double, features: Vector) => OldLabeledPoint(label, OldVectors.fromML(features)) } - var model = $(selectorType) match { - case "KBest" => + var model = ChiSqSelectorType.withName($(selectorType)) match { + case ChiSqSelectorType.KBest => new feature.ChiSqSelector().setNumTopFeatures($(numTopFeatures)).fit(input) - case "Percentile" => + case ChiSqSelectorType.Percentile => new feature.ChiSqSelector().setPercentile($(percentile)).fit(input) - case "Fpr" => + case ChiSqSelectorType.Fpr => new feature.ChiSqSelector().setAlpha($(alpha)).fit(input) - case _ => throw new IllegalStateException("Unknown ChiSqSelector Type.") + case errorType => + throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType") } copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 6c0db2cb8ccac..74a61dc9ce585 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -33,7 +33,7 @@ import org.apache.spark.SparkContext import org.apache.spark.sql.{Row, SparkSession} @Since("2.1.0") -object ChiSqSelectorType extends Enumeration { +private[spark] object ChiSqSelectorType extends Enumeration { type SelectorType = Value val KBest, Percentile, Fpr = Value } @@ -180,7 +180,7 @@ object ChiSqSelectorModel extends 
Loader[ChiSqSelectorModel] { @Since("2.1.0") class ChiSqSelector @Since("2.1.0") () extends Serializable { private var numTopFeatures: Int = 50 - private var percentile: Double = 10.0 + private var percentile: Double = 0.1 private var alpha: Double = 0.05 private var selectorType = ChiSqSelectorType.KBest @@ -199,7 +199,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { @Since("2.1.0") def setPercentile(value: Double): this.type = { - require(value <= 100 && value >= 0, "Percentile should be larger than 0 and less than 100") + require(value <= 1 && value >= 0, "Percentile should be larger than 0 and less than 100") percentile = value selectorType = ChiSqSelectorType.Percentile this @@ -227,20 +227,21 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { */ @Since("1.3.0") def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { - var indices = selectorType match { - case ChiSqSelectorType.KBest => Statistics.chiSqTest(data) + val chiSqTestResult = Statistics.chiSqTest(data) + val features = selectorType match { + case ChiSqSelectorType.KBest => chiSqTestResult .zipWithIndex.sortBy { case (res, _) => -res.statistic } .take(numTopFeatures) - .map { case (_, indices) => indices } - case ChiSqSelectorType.Percentile => Statistics.chiSqTest(data) + case ChiSqSelectorType.Percentile => chiSqTestResult .zipWithIndex.sortBy { case (res, _) => -res.statistic } - .take((data.count() * percentile / 100).toInt) - .map { case (_, indices) => indices } - case ChiSqSelectorType.Fpr => Statistics.chiSqTest(data) + .take((chiSqTestResult.length * percentile).toInt) + case ChiSqSelectorType.Fpr => chiSqTestResult .zipWithIndex.filter{ case (res, _) => res.pValue < alpha } - .map { case (_, indices) => indices } - case _ => throw new IllegalStateException("Unknown ChiSqSelector Type") + case errorType => + throw new IllegalStateException("Unknown ChiSqSelector Type: $errorType") } + val indices = features.map { case (_, indices) => indices } new 
ChiSqSelectorModel(indices) } } + diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala index 50d175a18753c..e0293dbc4b0b2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala @@ -60,7 +60,7 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext assert(vec1 ~== vec2 absTol 1e-1) } - selector.setPercentile(34).fit(df).transform(df) + selector.setPercentile(0.34).fit(df).transform(df) .select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) From 61b71c81a75f88f8f9144e325e1a58f271e1aba0 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Mon, 22 Aug 2016 19:34:37 +0800 Subject: [PATCH 07/21] change the default value of Percentile --- .../scala/org/apache/spark/ml/feature/ChiSqSelector.scala | 2 +- .../org/apache/spark/mllib/feature/ChiSqSelector.scala | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 0d1946136832f..b9b88a0ddcf66 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -59,7 +59,7 @@ private[feature] trait ChiSqSelectorParams extends Params final val percentile = new DoubleParam(this, "percentile", "Percentile of features that selector will select, ordered by statistics value descending.", ParamValidators.gtEq(0)) - setDefault(percentile -> 10) + setDefault(percentile -> 0.1) /** @group getParam */ def getPercentile: Double = $(percentile) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 74a61dc9ce585..79692f51887e1 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -76,7 +76,11 @@ class ChiSqSelectorModel @Since("1.3.0") ( * @param filterIndices indices of features to filter */ private def compress(features: Vector, filterIndices: Array[Int]): Vector = { - val orderedIndices = filterIndices.sorted + val orderedIndices = if (isSorted(filterIndices)) { + filterIndices + } else { + filterIndices.sorted + } features match { case SparseVector(size, indices, values) => val newSize = orderedIndices.length From d7b2892129cb64600443992a0041ef1781f74d86 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Tue, 23 Aug 2016 12:57:34 +0800 Subject: [PATCH 08/21] Add require for setAlpha value --- .../scala/org/apache/spark/mllib/feature/ChiSqSelector.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 79692f51887e1..0302b2ca86c34 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -203,7 +203,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { @Since("2.1.0") def setPercentile(value: Double): this.type = { - require(value <= 1 && value >= 0, "Percentile should be larger than 0 and less than 100") + require(value <= 1 && value >= 0, "Percentile should be larger than 0 and less than 1") percentile = value selectorType = ChiSqSelectorType.Percentile this @@ -211,6 +211,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { @Since("2.1.0") def setAlpha(value: Double): this.type = { + require(value <= 1 && value >= 0, "alpha value should be larger than 0 and less than 
1") alpha = value selectorType = ChiSqSelectorType.Fpr this From 6699396aa7413040e24dddca679f24fe75c05cfb Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Tue, 23 Aug 2016 20:30:01 +0800 Subject: [PATCH 09/21] rm isSorted function, change gtEq(0) to inRange(0,1) for percentile and alpha check --- .../spark/ml/feature/ChiSqSelector.scala | 4 ++-- .../spark/mllib/feature/ChiSqSelector.scala | 22 ++++--------------- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index b9b88a0ddcf66..6affcd4f25453 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -58,7 +58,7 @@ private[feature] trait ChiSqSelectorParams extends Params final val percentile = new DoubleParam(this, "percentile", "Percentile of features that selector will select, ordered by statistics value descending.", - ParamValidators.gtEq(0)) + ParamValidators.inRange(0, 1)) setDefault(percentile -> 0.1) /** @group getParam */ @@ -66,7 +66,7 @@ private[feature] trait ChiSqSelectorParams extends Params final val alpha = new DoubleParam(this, "alpha", "The highest p-value for features to be kept.", - ParamValidators.gtEq(0)) + ParamValidators.inRange(0, 1)) setDefault(alpha -> 0.05) /** @group getParam */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 0302b2ca86c34..a47a896633323 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -47,16 +47,6 @@ private[spark] object ChiSqSelectorType extends Enumeration { class ChiSqSelectorModel @Since("1.3.0") ( @Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer with 
Saveable { - protected def isSorted(array: Array[Int]): Boolean = { - var i = 1 - val len = array.length - while (i < len) { - if (array(i) < array(i-1)) return false - i += 1 - } - true - } - /** * Applies transformation on a vector. * @@ -76,11 +66,7 @@ class ChiSqSelectorModel @Since("1.3.0") ( * @param filterIndices indices of features to filter */ private def compress(features: Vector, filterIndices: Array[Int]): Vector = { - val orderedIndices = if (isSorted(filterIndices)) { - filterIndices - } else { - filterIndices.sorted - } + val orderedIndices = filterIndices.sorted features match { case SparseVector(size, indices, values) => val newSize = orderedIndices.length @@ -203,7 +189,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { @Since("2.1.0") def setPercentile(value: Double): this.type = { - require(value <= 1 && value >= 0, "Percentile should be larger than 0 and less than 1") + require(0.0 <= value && value <= 1.0, "Percentile must be in [0,1]") percentile = value selectorType = ChiSqSelectorType.Percentile this @@ -211,7 +197,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { @Since("2.1.0") def setAlpha(value: Double): this.type = { - require(value <= 1 && value >= 0, "alpha value should be larger than 0 and less than 1") + require(0.0 <= value && value <= 1.0, "Alpha must be in [0,1]") alpha = value selectorType = ChiSqSelectorType.Fpr this @@ -243,7 +229,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { case ChiSqSelectorType.Fpr => chiSqTestResult .zipWithIndex.filter{ case (res, _) => res.pValue < alpha } case errorType => - throw new IllegalStateException("Unknown ChiSqSelector Type: $errorType") + throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType") } val indices = features.map { case (_, indices) => indices } new ChiSqSelectorModel(indices) From b8986b5cd763b9d44c6672571ad27552ba8bca73 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Tue, 23 Aug 2016 20:49:58 +0800 
Subject: [PATCH 10/21] Optimize fit function of ml ChiSqSelector --- .../org/apache/spark/ml/feature/ChiSqSelector.scala | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 6affcd4f25453..da5c9c536296e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -137,16 +137,18 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str case Row(label: Double, features: Vector) => OldLabeledPoint(label, OldVectors.fromML(features)) } - var model = ChiSqSelectorType.withName($(selectorType)) match { + var selector = new feature.ChiSqSelector() + ChiSqSelectorType.withName($(selectorType)) match { case ChiSqSelectorType.KBest => - new feature.ChiSqSelector().setNumTopFeatures($(numTopFeatures)).fit(input) + selector.setNumTopFeatures($(numTopFeatures)) case ChiSqSelectorType.Percentile => - new feature.ChiSqSelector().setPercentile($(percentile)).fit(input) + selector.setPercentile($(percentile)) case ChiSqSelectorType.Fpr => - new feature.ChiSqSelector().setAlpha($(alpha)).fit(input) + selector.setAlpha($(alpha)) case errorType => throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType") } + val model = selector.fit(input) copyValues(new ChiSqSelectorModel(uid, model).setParent(this)) } From 5c2e44cba3494623c283d56bd7cfc1c915901b22 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Wed, 24 Aug 2016 22:28:02 +0800 Subject: [PATCH 11/21] Fpr to FPR, sort all cases in fit --- .../org/apache/spark/ml/feature/ChiSqSelector.scala | 10 +++++----- .../apache/spark/mllib/feature/ChiSqSelector.scala | 11 +++++------ 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index da5c9c536296e..7992b8b78a8f5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -73,14 +73,14 @@ private[feature] trait ChiSqSelectorParams extends Params def getAlpha: Double = $(alpha) /** - * The ChiSqSelector supports KBest, Percentile, Fpr selection, + * The ChiSqSelector supports KBest, Percentile, FPR selection, * which is the same as ChiSqSelectorType defined in MLLIB. * when call setNumTopFeatures, the selectorType is set to KBest * when call setPercentile, the selectorType is set to Percentile - * when call setFpr, the selectorType is set to Fpr + * when call setAlpha, the selectorType is set to FPR */ final val selectorType = new Param[String](this, "selectorType", - "ChiSqSelector Type: KBest, Percentile, Fpr") + "ChiSqSelector Type: KBest, Percentile, FPR") setDefault(selectorType -> ChiSqSelectorType.KBest.toString) /** @group getParam */ @@ -113,7 +113,7 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str @Since("2.1.0") def setAlpha(value: Double): this.type = { - set(selectorType, ChiSqSelectorType.Fpr.toString) + set(selectorType, ChiSqSelectorType.FPR.toString) set(alpha, value) } @@ -143,7 +143,7 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str selector.setNumTopFeatures($(numTopFeatures)) case ChiSqSelectorType.Percentile => selector.setPercentile($(percentile)) - case ChiSqSelectorType.Fpr => + case ChiSqSelectorType.FPR => selector.setAlpha($(alpha)) case errorType => throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index a47a896633323..610f5c2dd479b 100644 --- 
a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -35,7 +35,7 @@ import org.apache.spark.sql.{Row, SparkSession} @Since("2.1.0") private[spark] object ChiSqSelectorType extends Enumeration { type SelectorType = Value - val KBest, Percentile, Fpr = Value + val KBest, Percentile, FPR = Value } /** @@ -199,7 +199,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { def setAlpha(value: Double): this.type = { require(0.0 <= value && value <= 1.0, "Alpha must be in [0,1]") alpha = value - selectorType = ChiSqSelectorType.Fpr + selectorType = ChiSqSelectorType.FPR this } @@ -219,15 +219,14 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { @Since("1.3.0") def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { val chiSqTestResult = Statistics.chiSqTest(data) + .zipWithIndex.sortBy { case (res, _) => -res.statistic } val features = selectorType match { case ChiSqSelectorType.KBest => chiSqTestResult - .zipWithIndex.sortBy { case (res, _) => -res.statistic } .take(numTopFeatures) case ChiSqSelectorType.Percentile => chiSqTestResult - .zipWithIndex.sortBy { case (res, _) => -res.statistic } .take((chiSqTestResult.length * percentile).toInt) - case ChiSqSelectorType.Fpr => chiSqTestResult - .zipWithIndex.filter{ case (res, _) => res.pValue < alpha } + case ChiSqSelectorType.FPR => chiSqTestResult + .filter{ case (res, _) => res.pValue < alpha } case errorType => throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType") } From 0d3967af95bd4e303cf6a5e1e826806f0f6ee617 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Mon, 29 Aug 2016 22:35:35 +0800 Subject: [PATCH 12/21] Add Python API for ChiSqSelector --- .../mllib/api/python/PythonMLLibAPI.scala | 28 ++++++++-- python/pyspark/mllib/feature.py | 52 ++++++++++++++++--- 2 files changed, 71 insertions(+), 9 deletions(-) diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index a80cca70f4b28..5dfbd55790b10 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -629,13 +629,35 @@ private[python] class PythonMLLibAPI extends Serializable { } /** - * Java stub for ChiSqSelector.fit(). This stub returns a + * Java stub for ChiSqSelector.fit() when the seletion type is KBest. This stub returns a * handle to the Java object instead of the content of the Java object. * Extra care needs to be taken in the Python code to ensure it gets freed on * exit; see the Py4J documentation. */ - def fitChiSqSelector(numTopFeatures: Int, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { - new ChiSqSelector(numTopFeatures).fit(data.rdd) + def fitChiSqSelectorKBest(numTopFeatures: Int, + data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { + new ChiSqSelector().setNumTopFeatures(numTopFeatures).fit(data.rdd) + } + + /** + * Java stub for ChiSqSelector.fit() when the selection type is Percentile. This stub returns a + * handle to the Java object instead of the content of the Java object. + * Extra care needs to be taken in the Python code to ensure it gets freed on + * exit; see the Py4J documentation. + */ + def fitChiSqSelectorPercentile(percentile: Double, + data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { + new ChiSqSelector().setPercentile(percentile).fit(data.rdd) + } + + /** + * Java stub for ChiSqSelector.fit() when the selection type is FPR. This stub returns a + * handle to the Java object instead of the content of the Java object. + * Extra care needs to be taken in the Python code to ensure it gets freed on + * exit; see the Py4J documentation. 
+ */ + def fitChiSqSelectorFPR(alpha: Double, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { + new ChiSqSelector().setAlpha(alpha).fit(data.rdd) } /** diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index c8a6e33f4d9a4..2bdbabb32ad10 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -276,24 +276,64 @@ class ChiSqSelector(object): """ Creates a ChiSquared feature selector. - :param numTopFeatures: number of features that selector will select. - >>> data = [ ... LabeledPoint(0.0, SparseVector(3, {0: 8.0, 1: 7.0})), ... LabeledPoint(1.0, SparseVector(3, {1: 9.0, 2: 6.0})), ... LabeledPoint(1.0, [0.0, 9.0, 8.0]), ... LabeledPoint(2.0, [8.0, 9.0, 5.0]) ... ] - >>> model = ChiSqSelector(1).fit(sc.parallelize(data)) + >>> model = ChiSqSelector().setNumTopFeatures(1).fit(sc.parallelize(data)) + >>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0})) + SparseVector(1, {0: 6.0}) + >>> model.transform(DenseVector([8.0, 9.0, 5.0])) + DenseVector([5.0]) + >>> model = ChiSqSelector().setPercentile(0.34).fit(sc.parallelize(data)) >>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0})) SparseVector(1, {0: 6.0}) >>> model.transform(DenseVector([8.0, 9.0, 5.0])) DenseVector([5.0]) + >>> data = [ + ... LabeledPoint(0.0, SparseVector(4, {0: 8.0, 1: 7.0})), + ... LabeledPoint(1.0, SparseVector(4, {1: 9.0, 2: 6.0, 3: 4.0})), + ... LabeledPoint(1.0, [0.0, 9.0, 8.0, 4.0]), + ... LabeledPoint(2.0, [8.0, 9.0, 5.0, 9.0]) + ... ] + >>> model = ChiSqSelector().setAlpha(0.1).fit(sc.parallelize(data)) + >>> model.transform(DenseVector([1.0,2.0,3.0,4.0])) + DenseVector([4.0]) .. 
versionadded:: 1.4.0 """ - def __init__(self, numTopFeatures): - self.numTopFeatures = int(numTopFeatures) + def __init__(self): + self.param = 50 + self.fitFunc = "fitChiSqSelectorKBest" + + @since('2.1.0') + def setNumTopFeatures(self, numTopFeatures): + """ + set numTopFeature for feature selection by number of top features + """ + self.param = int(numTopFeatures) + self.fitFunc = "fitChiSqSelectorKBest" + return self + + @since('2.1.0') + def setPercentile(self, percentile): + """ + set Percentile [0.0, 1.0] for feature selection by percentile + """ + self.param = float(percentile) + self.fitFunc = "fitChiSqSelectorPercentile" + return self + + @since('2.1.0') + def setAlpha(self, alpha): + """ + set Alpha [0.0, 1.0] for feature selection by FPR + """ + self.param = float(alpha) + self.fitFunc = "fitChiSqSelectorFPR" + return self @since('1.4.0') def fit(self, data): @@ -305,7 +345,7 @@ def fit(self, data): treated as categorical for each distinct value. Apply feature discretizer before using this function. """ - jmodel = callMLlibFunc("fitChiSqSelector", self.numTopFeatures, data) + jmodel = callMLlibFunc(self.fitFunc, self.param, data) return ChiSqSelectorModel(jmodel) From 1dc6a8ebad693009dd4bca0e579252ec274bce86 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Mon, 5 Sep 2016 13:55:51 +0800 Subject: [PATCH 13/21] split the ChiSqSelector param to numTopFeateres, Percentile, Alpha in Python --- python/pyspark/mllib/feature.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 2bdbabb32ad10..bb0cc13419d42 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -271,6 +271,11 @@ def transform(self, vector): """ return JavaVectorTransformer.transform(self, vector) +class ChiSqSelectorType: + """ + This class defines the selector types of Chi Square Selector. 
+ """ + KBest, Percentile, FPR = range(3) class ChiSqSelector(object): """ @@ -305,34 +310,34 @@ class ChiSqSelector(object): .. versionadded:: 1.4.0 """ def __init__(self): - self.param = 50 - self.fitFunc = "fitChiSqSelectorKBest" + self.numTopFeatures = 50 + self.selectorType = ChiSqSelectorType.KBest @since('2.1.0') def setNumTopFeatures(self, numTopFeatures): """ set numTopFeature for feature selection by number of top features """ - self.param = int(numTopFeatures) - self.fitFunc = "fitChiSqSelectorKBest" + self.numTopFeatures = int(numTopFeatures) + self.selectorType = ChiSqSelectorType.KBest return self @since('2.1.0') def setPercentile(self, percentile): """ - set Percentile [0.0, 1.0] for feature selection by percentile + set percentile [0.0, 1.0] for feature selection by percentile """ - self.param = float(percentile) - self.fitFunc = "fitChiSqSelectorPercentile" + self.percentile = float(percentile) + self.selectorType = ChiSqSelectorType.Percentile return self @since('2.1.0') def setAlpha(self, alpha): """ - set Alpha [0.0, 1.0] for feature selection by FPR + set alpha [0.0, 1.0] for feature selection by FPR """ - self.param = float(alpha) - self.fitFunc = "fitChiSqSelectorFPR" + self.alpha = float(alpha) + self.selectorType = ChiSqSelectorType.FPR return self @since('1.4.0') @@ -345,7 +350,12 @@ def fit(self, data): treated as categorical for each distinct value. Apply feature discretizer before using this function. 
""" - jmodel = callMLlibFunc(self.fitFunc, self.param, data) + if self.selectorType == ChiSqSelectorType.KBest: + jmodel = callMLlibFunc("fitChiSqSelectorKBest", self.numTopFeatures, data) + elif self.selectorType == ChiSqSelectorType.Percentile: + jmodel = callMLlibFunc("fitChiSqSelectorPercentile", self.percentile, data) + elif self.selectorType == ChiSqSelectorType.FPR: + jmodel = callMLlibFunc("fitChiSqSelectorFPR", self.alpha, data) return ChiSqSelectorModel(jmodel) From 990887181b12c354270ae3127e10fa35e790a784 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Mon, 5 Sep 2016 23:22:05 +0800 Subject: [PATCH 14/21] Add type check for Python ChiSqSelector --- python/pyspark/mllib/feature.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index bb0cc13419d42..1c18d978607f4 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -356,6 +356,8 @@ def fit(self, data): jmodel = callMLlibFunc("fitChiSqSelectorPercentile", self.percentile, data) elif self.selectorType == ChiSqSelectorType.FPR: jmodel = callMLlibFunc("fitChiSqSelectorFPR", self.alpha, data) + else: + raise TypeError("Chi Square selector only supports: KBest, Percentile, and FPR.") return ChiSqSelectorModel(jmodel) From bbccac7b5d3812d71498f9a420829f3e67955794 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Tue, 6 Sep 2016 17:19:09 +0800 Subject: [PATCH 15/21] Change the exception type of value check --- python/pyspark/mllib/feature.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 1c18d978607f4..5350f949c3ecf 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -357,7 +357,8 @@ def fit(self, data): elif self.selectorType == ChiSqSelectorType.FPR: jmodel = callMLlibFunc("fitChiSqSelectorFPR", self.alpha, data) else: - raise TypeError("Chi Square selector only supports: KBest, 
Percentile, and FPR.") + raise ValueError("ChiSqSelector type supports KBest(0), Percentile(1) and " + "FPR(2), the current value is: %s" % self.selectorType) return ChiSqSelectorModel(jmodel) From c35bcf14a7c1694ec03f4c58fc6f5b8d5ef3e5bb Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Tue, 13 Sep 2016 17:57:46 +0800 Subject: [PATCH 16/21] change python code style --- python/pyspark/mllib/feature.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 5350f949c3ecf..26abe4175ce28 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -357,8 +357,8 @@ def fit(self, data): elif self.selectorType == ChiSqSelectorType.FPR: jmodel = callMLlibFunc("fitChiSqSelectorFPR", self.alpha, data) else: - raise ValueError("ChiSqSelector type supports KBest(0), Percentile(1) and " - "FPR(2), the current value is: %s" % self.selectorType) + raise ValueError("ChiSqSelector type supports KBest(0), Percentile(1) and" + " FPR(2), the current value is: %s" % self.selectorType) return ChiSqSelectorModel(jmodel) From e8f03edbc54d3c9cb32688f79dd129dbf043da38 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Tue, 13 Sep 2016 18:33:40 +0800 Subject: [PATCH 17/21] change python code style --- python/pyspark/mllib/feature.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 26abe4175ce28..dd876fbac9194 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -271,12 +271,14 @@ def transform(self, vector): """ return JavaVectorTransformer.transform(self, vector) + class ChiSqSelectorType: """ This class defines the selector types of Chi Square Selector. """ KBest, Percentile, FPR = range(3) + class ChiSqSelector(object): """ Creates a ChiSquared feature selector. 
From ec74ddebe6bc9f1e85c60e9fcd88355691b5f42c Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Tue, 13 Sep 2016 20:19:55 +0800 Subject: [PATCH 18/21] revert isSort to pass MiMa test --- .../spark/mllib/feature/ChiSqSelector.scala | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 610f5c2dd479b..16beb5072c26d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -47,6 +47,16 @@ private[spark] object ChiSqSelectorType extends Enumeration { class ChiSqSelectorModel @Since("1.3.0") ( @Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer with Saveable { + protected def isSorted(array: Array[Int]): Boolean = { + var i = 1 + val len = array.length + while (i < len) { + if (array(i) < array(i-1)) return false + i += 1 + } + true + } + /** * Applies transformation on a vector. 
* @@ -169,10 +179,10 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { */ @Since("2.1.0") class ChiSqSelector @Since("2.1.0") () extends Serializable { - private var numTopFeatures: Int = 50 - private var percentile: Double = 0.1 - private var alpha: Double = 0.05 - private var selectorType = ChiSqSelectorType.KBest + var numTopFeatures: Int = 50 + var percentile: Double = 0.1 + var alpha: Double = 0.05 + var selectorType = ChiSqSelectorType.KBest @Since("1.3.0") def this(numTopFeatures: Int) { From 6398f4ca954ff0971c5ef3db7956847b09d54849 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Wed, 14 Sep 2016 15:11:15 +0800 Subject: [PATCH 19/21] Change MimaExcludes --- .../org/apache/spark/mllib/feature/ChiSqSelector.scala | 10 ---------- project/MimaExcludes.scala | 3 +++ 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 16beb5072c26d..226b296293133 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -47,16 +47,6 @@ private[spark] object ChiSqSelectorType extends Enumeration { class ChiSqSelectorModel @Since("1.3.0") ( @Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer with Saveable { - protected def isSorted(array: Array[Int]): Boolean = { - var i = 1 - val len = array.length - while (i < len) { - if (array(i) < array(i-1)) return false - i += 1 - } - true - } - /** * Applies transformation on a vector. 
* diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 688218f6f43af..d24f2f3f25192 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -787,6 +787,9 @@ object MimaExcludes { ) ++ Seq( // [SPARK-14743] Improve delegation token handling in secure cluster ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.SparkHadoopUtil.getTimeFromNowToRenewal") + ) ++ Seq( + // [SPARK-17017] Add chiSquare selector based on False Positive Rate (FPR) test + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.mllib.feature.ChiSqSelectorModel.isSorted") ) } From 1d2f67f01be415203173bc441fbb238ccb7121a9 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Sun, 18 Sep 2016 15:43:24 +0800 Subject: [PATCH 20/21] add javadoc --- .../org/apache/spark/ml/feature/ChiSqSelector.scala | 5 ++++- .../org/apache/spark/mllib/feature/ChiSqSelector.scala | 10 ++++++++-- python/pyspark/mllib/feature.py | 4 ++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 7992b8b78a8f5..accaecb9d6dce 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -90,6 +90,9 @@ private[feature] trait ChiSqSelectorParams extends Params /** * Chi-Squared feature selection, which selects categorical features to use for predicting a * categorical label. + * The selector supports three selection methods: KBest, Percentile and FPR. + * By default, the selection method is KBest, the default number of top features is 50. + * User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods. 
*/ @Since("1.6.0") final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: String) @@ -99,7 +102,7 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str def this() = this(Identifiable.randomUID("chiSqSelector")) /** @group setParam */ - @Since("2.1.0") + @Since("1.6.0") def setNumTopFeatures(value: Int): this.type = { set(selectorType, ChiSqSelectorType.KBest.toString) set(numTopFeatures, value) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 87998af01d39d..079e8ff9d598e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -166,21 +166,27 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { /** * Creates a ChiSquared feature selector. + * The selector supports three selection methods: KBest, Percentile and FPR. + * By default, the selection method is KBest, the default number of top features is 50. + * User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods. 
*/ -@Since("2.1.0") +@Since("1.3.0") class ChiSqSelector @Since("2.1.0") () extends Serializable { var numTopFeatures: Int = 50 var percentile: Double = 0.1 var alpha: Double = 0.05 var selectorType = ChiSqSelectorType.KBest + /** + * This is the same as calling this() and setNumTopFeatures(numTopFeatures) + */ @Since("1.3.0") def this(numTopFeatures: Int) { this() this.numTopFeatures = numTopFeatures } - @Since("2.1.0") + @Since("1.6.0") def setNumTopFeatures(value: Int): this.type = { numTopFeatures = value selectorType = ChiSqSelectorType.KBest diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index cc4be6f994ecc..5b489617afb45 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -310,8 +310,8 @@ class ChiSqSelector(object): .. versionadded:: 1.4.0 """ - def __init__(self): - self.numTopFeatures = 50 + def __init__(self, numTopFeatures=50): + self.numTopFeatures = numTopFeatures self.selectorType = ChiSqSelectorType.KBest @since('2.1.0') From 88d2143989a4219020100999c53bb1186fce5d1c Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Mon, 19 Sep 2016 09:36:05 +0800 Subject: [PATCH 21/21] change javadoc --- .../scala/org/apache/spark/ml/feature/ChiSqSelector.scala | 7 +++++-- .../org/apache/spark/mllib/feature/ChiSqSelector.scala | 7 +++++-- python/pyspark/mllib/feature.py | 6 ++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index accaecb9d6dce..0c6a37bab0aad 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -90,8 +90,11 @@ private[feature] trait ChiSqSelectorParams extends Params /** * Chi-Squared feature selection, which selects categorical features to use for predicting a * categorical label. 
- * The selector supports three selection methods: KBest, Percentile and FPR. - * By default, the selection method is KBest, the default number of top features is 50. + * The selector supports three selection methods: `KBest`, `Percentile` and `FPR`. + * `KBest` chooses the `k` top features according to a chi-squared test. + * `Percentile` is similar but chooses a fraction of all features instead of a fixed number. + * `FPR` chooses all features whose false positive rate meets some threshold. + * By default, the selection method is `KBest`, the default number of top features is 50. * User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods. */ @Since("1.6.0") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 079e8ff9d598e..f68a017184b21 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -166,8 +166,11 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { /** * Creates a ChiSquared feature selector. - * The selector supports three selection methods: KBest, Percentile and FPR. - * By default, the selection method is KBest, the default number of top features is 50. + * The selector supports three selection methods: `KBest`, `Percentile` and `FPR`. + * `KBest` chooses the `k` top features according to a chi-squared test. + * `Percentile` is similar but chooses a fraction of all features instead of a fixed number. + * `FPR` chooses all features whose false positive rate meets some threshold. + * By default, the selection method is `KBest`, the default number of top features is 50. * User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods. 
*/ @Since("1.3.0") diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 87449979f3a8c..077c11370eb3f 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -281,6 +281,12 @@ class ChiSqSelectorType: class ChiSqSelector(object): """ Creates a ChiSquared feature selector. + The selector supports three selection methods: `KBest`, `Percentile` and `FPR`. + `KBest` chooses the `k` top features according to a chi-squared test. + `Percentile` is similar but chooses a fraction of all features instead of a fixed number. + `FPR` chooses all features whose false positive rate meets some threshold. + By default, the selection method is `KBest`, the default number of top features is 50. + User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods. >>> data = [ ... LabeledPoint(0.0, SparseVector(3, {0: 8.0, 1: 7.0})),