From 2c071791b2c6fd7d388343ac95783c32ffdae529 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Fri, 23 Sep 2016 15:27:19 +0800 Subject: [PATCH 1/9] add feature selector method: FDR and FWE --- .../spark/ml/feature/ChiSqSelector.scala | 58 +++++++++++++++---- .../mllib/api/python/PythonMLLibAPI.scala | 22 ++++++- .../spark/mllib/feature/ChiSqSelector.scala | 52 ++++++++++++++--- .../mllib/feature/ChiSqSelectorSuite.scala | 2 +- python/pyspark/mllib/feature.py | 36 ++++++++++-- 5 files changed, 142 insertions(+), 28 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 0c6a37bab0aad..0dfea1c6b26b0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -64,23 +64,41 @@ private[feature] trait ChiSqSelectorParams extends Params /** @group getParam */ def getPercentile: Double = $(percentile) - final val alpha = new DoubleParam(this, "alpha", + final val alphaFPR = new DoubleParam(this, "alphaFPR", "The highest p-value for features to be kept.", ParamValidators.inRange(0, 1)) - setDefault(alpha -> 0.05) + setDefault(alphaFPR -> 0.05) /** @group getParam */ - def getAlpha: Double = $(alpha) + def getAlphaFPR: Double = $(alphaFPR) + + final val alphaFDR = new DoubleParam(this, "alphaFDR", + "The highest uncorrected p-value for features to be kept.", + ParamValidators.inRange(0, 1)) + setDefault(alphaFDR -> 0.05) + + /** @group getParam */ + def getAlphaFDR: Double = $(alphaFDR) + + final val alphaFWE = new DoubleParam(this, "alphaFWE", + "The highest uncorrected p-value for features to be kept.", + ParamValidators.inRange(0, 1)) + setDefault(alphaFWE -> 0.05) + + /** @group getParam */ + def getAlphaFWE: Double = $(alphaFWE) /** - * The ChiSqSelector supports KBest, Percentile, FPR selection, + * The ChiSqSelector supports `KBest`, `Percentile`, `FPR`, 
`FDR`, `FWE` selection, * which is the same as ChiSqSelectorType defined in MLLIB. * when call setNumTopFeatures, the selectorType is set to KBest * when call setPercentile, the selectorType is set to Percentile - * when call setAlpha, the selectorType is set to FPR + * when call setFPR, the selectorType is set to FPR + * when call setFDR, the selectorType is set to FDR + * when call setFWE, the selectorType is set to FWE */ final val selectorType = new Param[String](this, "selectorType", - "ChiSqSelector Type: KBest, Percentile, FPR") + "ChiSqSelector Type: KBest, Percentile, FPR, FDR, FWE") setDefault(selectorType -> ChiSqSelectorType.KBest.toString) /** @group getParam */ @@ -93,7 +111,9 @@ private[feature] trait ChiSqSelectorParams extends Params * The selector supports three selection methods: `KBest`, `Percentile` and `FPR`. * `KBest` chooses the `k` top features according to a chi-squared test. * `Percentile` is similar but chooses a fraction of all features instead of a fixed number. - * `FPR` chooses all features whose false positive rate meets some threshold. + * `FPR` select features based on a false positive rate test. + * `FDR` select features based on an estimated false discovery rate. + * `FWE` select features based on family-wise error rate. * By default, the selection method is `KBest`, the default number of top features is 50. * User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods. 
*/ @@ -118,9 +138,21 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str } @Since("2.1.0") - def setAlpha(value: Double): this.type = { + def setFPR(value: Double): this.type = { set(selectorType, ChiSqSelectorType.FPR.toString) - set(alpha, value) + set(alphaFPR, value) + } + + @Since("2.1.0") + def setFDR(value: Double): this.type = { + set(selectorType, ChiSqSelectorType.FDR.toString) + set(alphaFDR, value) + } + + @Since("2.1.0") + def setFWE(value: Double): this.type = { + set(selectorType, ChiSqSelectorType.FWE.toString) + set(alphaFWE, value) } /** @group setParam */ @@ -143,14 +175,18 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str case Row(label: Double, features: Vector) => OldLabeledPoint(label, OldVectors.fromML(features)) } - var selector = new feature.ChiSqSelector() + val selector = new feature.ChiSqSelector() ChiSqSelectorType.withName($(selectorType)) match { case ChiSqSelectorType.KBest => selector.setNumTopFeatures($(numTopFeatures)) case ChiSqSelectorType.Percentile => selector.setPercentile($(percentile)) case ChiSqSelectorType.FPR => - selector.setAlpha($(alpha)) + selector.setFPR($(alphaFPR)) + case ChiSqSelectorType.FDR => + selector.setFDR($(alphaFDR)) + case ChiSqSelectorType.FWE => + selector.setFWE($(alphaFWE)) case errorType => throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType") } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 5cffbf0892888..da0d26b71b9e5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -657,7 +657,27 @@ private[python] class PythonMLLibAPI extends Serializable { * exit; see the Py4J documentation. 
*/ def fitChiSqSelectorFPR(alpha: Double, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { - new ChiSqSelector().setAlpha(alpha).fit(data.rdd) + new ChiSqSelector().setFPR(alpha).fit(data.rdd) + } + + /** + * Java stub for ChiSqSelector.fit() when the selection type is FDR. This stub returns a + * handle to the Java object instead of the content of the Java object. + * Extra care needs to be taken in the Python code to ensure it gets freed on + * exit; see the Py4J documentation. + */ + def fitChiSqSelectorFDR(alpha: Double, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { + new ChiSqSelector().setFDR(alpha).fit(data.rdd) + } + + /** + * Java stub for ChiSqSelector.fit() when the selection type is FWE. This stub returns a + * handle to the Java object instead of the content of the Java object. + * Extra care needs to be taken in the Python code to ensure it gets freed on + * exit; see the Py4J documentation. + */ + def fitChiSqSelectorFWE(alpha: Double, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { + new ChiSqSelector().setFWE(alpha).fit(data.rdd) } /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index f68a017184b21..5db9785691824 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -35,7 +35,7 @@ import org.apache.spark.sql.{Row, SparkSession} @Since("2.1.0") private[spark] object ChiSqSelectorType extends Enumeration { type SelectorType = Value - val KBest, Percentile, FPR = Value + val KBest, Percentile, FPR, FDR, FWE = Value } /** @@ -166,10 +166,12 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { /** * Creates a ChiSquared feature selector. - * The selector supports three selection methods: `KBest`, `Percentile` and `FPR`. 
+ * The selector supports five selection methods: `KBest`, `Percentile`, `FPR`, `FDR`, `FWE`. * `KBest` chooses the `k` top features according to a chi-squared test. * `Percentile` is similar but chooses a fraction of all features instead of a fixed number. - * `FPR` chooses all features whose false positive rate meets some threshold. + * `FPR` select features based on a false positive rate test. + * `FDR` select features based on an estimated false discovery rate. + * `FWE` select features based on family-wise error rate. * By default, the selection method is `KBest`, the default number of top features is 50. * User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods. */ @@ -177,7 +179,9 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { class ChiSqSelector @Since("2.1.0") () extends Serializable { var numTopFeatures: Int = 50 var percentile: Double = 0.1 - var alpha: Double = 0.05 + var alphaFPR: Double = 0.05 + var alphaFDR: Double = 0.05 + var alphaFWE: Double = 0.05 var selectorType = ChiSqSelectorType.KBest /** @@ -205,13 +209,29 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { } @Since("2.1.0") - def setAlpha(value: Double): this.type = { + def setFPR(value: Double): this.type = { require(0.0 <= value && value <= 1.0, "Alpha must be in [0,1]") - alpha = value + alphaFPR = value selectorType = ChiSqSelectorType.FPR this } + @Since("2.1.0") + def setFDR(value: Double): this.type = { + require(0.0 <= value && value <= 1.0, "Alpha must be in [0,1]") + alphaFDR = value + selectorType = ChiSqSelectorType.FDR + this + } + + @Since("2.1.0") + def setFWE(value: Double): this.type = { + require(0.0 <= value && value <= 1.0, "Alpha must be in [0,1]") + alphaFWE = value + selectorType = ChiSqSelectorType.FWE + this + } + @Since("2.1.0") def setChiSqSelectorType(value: ChiSqSelectorType.Value): this.type = { selectorType = value @@ -228,18 +248,32 @@ class ChiSqSelector @Since("2.1.0") () extends 
Serializable { @Since("1.3.0") def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { val chiSqTestResult = Statistics.chiSqTest(data) - .zipWithIndex.sortBy { case (res, _) => -res.statistic } + .zipWithIndex val features = selectorType match { case ChiSqSelectorType.KBest => chiSqTestResult + .sortBy { case (res, _) => -res.statistic } .take(numTopFeatures) case ChiSqSelectorType.Percentile => chiSqTestResult + .sortBy { case (res, _) => -res.statistic } .take((chiSqTestResult.length * percentile).toInt) case ChiSqSelectorType.FPR => chiSqTestResult - .filter{ case (res, _) => res.pValue < alpha } + .filter{ case (res, _) => res.pValue < alphaFPR } + case ChiSqSelectorType.FDR => + val tempRDD = chiSqTestResult + .sortBy{ case (res, _) => res.pValue } + val maxIndex = tempRDD + .zipWithIndex + .filter{ case ((res, index1), index2) => + res.pValue <= alphaFDR * (index2 + 1) / chiSqTestResult.length } + .map{ case (_, index) => index} + .max + tempRDD.take(maxIndex + 1) + case ChiSqSelectorType.FWE => chiSqTestResult + .filter{ case (res, _) => res.pValue < alphaFWE/chiSqTestResult.length } case errorType => throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType") } - val indices = features.map { case (_, indices) => indices } + val indices = features.map { case (_, index) => index } new ChiSqSelectorModel(indices) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala index e181a544f7159..a036d73f7ebbb 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala @@ -76,7 +76,7 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(2.0, Vectors.dense(Array(9.0)))) - val model = new 
ChiSqSelector().setAlpha(0.1).fit(labeledDiscreteData) + val model = new ChiSqSelector().setFPR(0.1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 077c11370eb3f..138840241e658 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -275,7 +275,7 @@ class ChiSqSelectorType: """ This class defines the selector types of Chi Square Selector. """ - KBest, Percentile, FPR = range(3) + KBest, Percentile, FPR, FDR, FWE = range(5) class ChiSqSelector(object): @@ -284,7 +284,9 @@ class ChiSqSelector(object): The selector supports three selection methods: `KBest`, `Percentile` and `FPR`. `KBest` chooses the `k` top features according to a chi-squared test. `Percentile` is similar but chooses a fraction of all features instead of a fixed number. - `FPR` chooses all features whose false positive rate meets some threshold. + `FPR` select features based on a false positive rate test. + `FDR` select features based on an estimated false discovery rate. + `FWE` select features based on family-wise error rate. By default, the selection method is `KBest`, the default number of top features is 50. User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods. @@ -310,7 +312,7 @@ class ChiSqSelector(object): ... LabeledPoint(1.0, [0.0, 9.0, 8.0, 4.0]), ... LabeledPoint(2.0, [8.0, 9.0, 5.0, 9.0]) ... 
] - >>> model = ChiSqSelector().setAlpha(0.1).fit(sc.parallelize(data)) + >>> model = ChiSqSelector().setFPR(0.1).fit(sc.parallelize(data)) >>> model.transform(DenseVector([1.0,2.0,3.0,4.0])) DenseVector([4.0]) @@ -339,14 +341,32 @@ def setPercentile(self, percentile): return self @since('2.1.0') - def setAlpha(self, alpha): + def setFPR(self, alpha): """ set alpha [0.0, 1.0] for feature selection by FPR """ - self.alpha = float(alpha) + self.alphaFPR = float(alpha) self.selectorType = ChiSqSelectorType.FPR return self + @since('2.1.0') + def setFDR(self, alpha): + """ + set alpha [0.0, 1.0] for feature selection by FDR + """ + self.alphaFDR = float(alpha) + self.selectorType = ChiSqSelectorType.FDR + return self + + @since('2.1.0') + def setFWE(self, alpha): + """ + set alpha [0.0, 1.0] for feature selection by FWE + """ + self.alphaFWE = float(alpha) + self.selectorType = ChiSqSelectorType.FWE + return self + @since('1.4.0') def fit(self, data): """ @@ -362,7 +382,11 @@ def fit(self, data): elif self.selectorType == ChiSqSelectorType.Percentile: jmodel = callMLlibFunc("fitChiSqSelectorPercentile", self.percentile, data) elif self.selectorType == ChiSqSelectorType.FPR: - jmodel = callMLlibFunc("fitChiSqSelectorFPR", self.alphaFPR, data) + elif self.selectorType == ChiSqSelectorType.FDR: + jmodel = callMLlibFunc("fitChiSqSelectorFDR", self.alphaFDR, data) + elif self.selectorType == ChiSqSelectorType.FWE: + jmodel = callMLlibFunc("fitChiSqSelectorFWE", self.alphaFWE, data) else: raise ValueError("ChiSqSelector type supports KBest(0), Percentile(1) and" " FPR(2), the current value is: %s" % self.selectorType) From 9c7fae33d715b1f8dc6d1fc4ee468cbe047ef5c9 Mon Sep 17 00:00:00 2001 From: Peng Date: Tue, 27 Sep 2016 20:26:28 +0800 Subject: [PATCH 2/9] fix python style bug --- python/pyspark/ml/feature.py | 25 +++++++++++++++---------- python/pyspark/mllib/feature.py | 3 ++- 2 files changed, 17 
insertions(+), 11 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 30311746acd9d..d6bd33239861a 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2602,34 +2602,39 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja typeConverter=TypeConverters.toFloat) alphaFPR = Param(Params._dummy(), "alphaFPR", "The highest p-value for features to be kept.", - typeConverter=TypeConverters.toFloat) + typeConverter=TypeConverters.toFloat) - alphaFDR = Param(Params._dummy(), "alphaFDR", "The highest uncorrected p-value for features to be kept.", - typeConverter=TypeConverters.toFloat) + alphaFDR = Param(Params._dummy(), "alphaFDR", "The highest uncorrected p-value for " + + "features to be kept.", typeConverter=TypeConverters.toFloat) - alphaFWE = Param(Params._dummy(), "alphaFWE", "The highest uncorrected p-value for features to be kept.", - typeConverter=TypeConverters.toFloat) + alphaFWE = Param(Params._dummy(), "alphaFWE", "The highest uncorrected p-value for " + + "features to be kept.", typeConverter=TypeConverters.toFloat) @keyword_only def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, - labelCol="label", selectorType="kbest", percentile=0.1, alphaFPR=0.05, alphaFDR=0.05, alphaFWE=0.05): + labelCol="label", selectorType="kbest", percentile=0.1, alphaFPR=0.05, + alphaFDR=0.05, alphaFWE=0.05): """ __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, \ - labelCol="label", selectorType="kbest", percentile=0.1, alphaFPR=0.05, alphaFDR=0.05, alphaFWE=0.05) + labelCol="label", selectorType="kbest", percentile=0.1, alphaFPR=0.05, \ + alphaFDR=0.05, alphaFWE=0.05) """ super(ChiSqSelector, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ChiSqSelector", self.uid) - self._setDefault(numTopFeatures=50, selectorType="kbest", percentile=0.1, alphaFPR=0.05, alphaFDR=0.05, alphaFWE=0.05) + 
self._setDefault(numTopFeatures=50, selectorType="kbest", percentile=0.1, alphaFPR=0.05, + alphaFDR=0.05, alphaFWE=0.05) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @keyword_only @since("2.0.0") def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, - labelCol="labels", selectorType="kbest", percentile=0.1, alphaFPR=0.05, alphaFDR=0.05, alphaFWE=0.05): + labelCol="labels", selectorType="kbest", percentile=0.1, alphaFPR=0.05, + alphaFDR=0.05, alphaFWE=0.05): """ setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, \ - labelCol="labels", selectorType="kbest", percentile=0.1, alphaFPR=0.05, alphaFDR=0.05, alphaFWE=0.05) + labelCol="labels", selectorType="kbest", percentile=0.1, alphaFPR=0.05, \ + alphaFDR=0.05, alphaFWE=0.05) Sets params for this ChiSqSelector. """ kwargs = self.setParams._input_kwargs diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index ac3396ee9e845..a0a2fc94283bd 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -311,7 +311,8 @@ class ChiSqSelector(object): .. versionadded:: 1.4.0 """ - def __init__(self, numTopFeatures=50, selectorType="kbest", percentile=0.1, alphaFPR=0.05, alphaFDR=0.05, alphaFWE=0.05): + def __init__(self, numTopFeatures=50, selectorType="kbest", percentile=0.1, alphaFPR=0.05, + alphaFDR=0.05, alphaFWE=0.05): self.numTopFeatures = numTopFeatures self.selectorType = selectorType self.percentile = percentile From 2e97c5542a7f1c5157d93f227da7cb33eaccc230 Mon Sep 17 00:00:00 2001 From: Peng Date: Tue, 27 Sep 2016 20:39:47 +0800 Subject: [PATCH 3/9] python style change --- python/pyspark/mllib/feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index a0a2fc94283bd..659e5f4724726 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -312,7 +312,7 @@ class ChiSqSelector(object): .. 
versionadded:: 1.4.0 """ def __init__(self, numTopFeatures=50, selectorType="kbest", percentile=0.1, alphaFPR=0.05, - alphaFDR=0.05, alphaFWE=0.05): + alphaFDR=0.05, alphaFWE=0.05): self.numTopFeatures = numTopFeatures self.selectorType = selectorType self.percentile = percentile From d05d7de190785c393dc4290a26cea61d414191af Mon Sep 17 00:00:00 2001 From: Peng Date: Mon, 10 Oct 2016 15:01:08 +0800 Subject: [PATCH 4/9] minor change --- .../org/apache/spark/mllib/feature/ChiSqSelector.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 1cdcb59e878f7..216c9434c6ab5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -244,20 +244,20 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { .take((chiSqTestResult.length * percentile).toInt) case ChiSqSelector.FPR => chiSqTestResult - .filter{ case (res, _) => res.pValue < alpha } + .filter { case (res, _) => res.pValue < alpha } case ChiSqSelector.FDR => val tempRDD = chiSqTestResult - .sortBy{ case (res, _) => res.pValue } + .sortBy { case (res, _) => res.pValue } val maxIndex = tempRDD .zipWithIndex - .filter{ case ((res, _), index) => + .filter { case ((res, _), index) => res.pValue <= alpha * (index + 1) / chiSqTestResult.length } - .map{ case (_, index) => index} + .map { case (_, index) => index} .max tempRDD.take(maxIndex + 1) case ChiSqSelector.FWE => chiSqTestResult - .filter{ case (res, _) => res.pValue < alpha/chiSqTestResult.length } + .filter { case (res, _) => res.pValue < alpha/chiSqTestResult.length } case errorType => throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType") } From d51b78b6cdd94b01a76a99b3d70f957737d88b20 Mon Sep 17 00:00:00 2001 From: Peng Date: Thu, 20 Oct 2016 15:36:55 
+0800 Subject: [PATCH 5/9] add test cases, and revise docs --- docs/ml-features.md | 6 +- docs/mllib-feature-extraction.md | 8 +- .../spark/ml/feature/ChiSqSelector.scala | 18 +- .../spark/mllib/feature/ChiSqSelector.scala | 37 ++- .../spark/ml/feature/ChiSqSelectorSuite.scala | 304 ++++++++++++++++-- .../mllib/feature/ChiSqSelectorSuite.scala | 226 +++++++++++-- python/pyspark/ml/feature.py | 14 +- python/pyspark/mllib/feature.py | 8 +- 8 files changed, 542 insertions(+), 79 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index a7f710fa52e64..9e4eb8dba68d9 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1333,14 +1333,16 @@ for more details on the API. `ChiSqSelector` stands for Chi-Squared feature selection. It operates on labeled data with categorical features. ChiSqSelector uses the [Chi-Squared test of independence](https://en.wikipedia.org/wiki/Chi-squared_test) to decide which -features to choose. It supports three selection methods: `KBest`, `Percentile` and `FPR`: +features to choose. It supports five selection methods: `KBest`, `Percentile`, `FPR`, `FDR` and `FWE`: * `KBest` chooses the `k` top features according to a chi-squared test. This is akin to yielding the features with the most predictive power. * `Percentile` is similar to `KBest` but chooses a fraction of all features instead of a fixed number. * `FPR` chooses all features whose false positive rate meets some threshold. +* `FDR` chooses all features whose false discovery rate meets some threshold. +* `FWE` chooses all features whose family-wise error rate meets some threshold. By default, the selection method is `KBest`, the default number of top features is 50. User can use -`setNumTopFeatures`, `setPercentile` and `setAlpha` to set different selection methods. +`setSelectorType` to set different selection methods. 
**Examples** diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md index 87e1e027e945b..41577decad877 100644 --- a/docs/mllib-feature-extraction.md +++ b/docs/mllib-feature-extraction.md @@ -227,20 +227,22 @@ both speed and statistical learning behavior. [`ChiSqSelector`](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) implements Chi-Squared feature selection. It operates on labeled data with categorical features. ChiSqSelector uses the [Chi-Squared test of independence](https://en.wikipedia.org/wiki/Chi-squared_test) to decide which -features to choose. It supports three selection methods: `KBest`, `Percentile` and `FPR`: +features to choose. It supports five selection methods: `KBest`, `Percentile`, `FPR`, `FDR`, and `FWE`: * `KBest` chooses the `k` top features according to a chi-squared test. This is akin to yielding the features with the most predictive power. * `Percentile` is similar to `KBest` but chooses a fraction of all features instead of a fixed number. * `FPR` chooses all features whose false positive rate meets some threshold. +* `FDR` chooses all features whose false discovery rate meets some threshold. +* `FWE` chooses all features whose family-wise error rate meets some threshold. By default, the selection method is `KBest`, the default number of top features is 50. User can use -`setNumTopFeatures`, `setPercentile` and `setAlpha` to set different selection methods. +`setSelectorType` to set different selection methods. The number of features to select can be tuned using a held-out validation set. ### Model Fitting -`ChiSqSelector` takes a `numTopFeatures` parameter specifying the number of top features that +`ChiSqSelector` can take a `numTopFeatures` parameter specifying the number of top features that the selector will select. 
The [`fit`](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) method takes diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 033a0b9c6de17..da131e4f7fc91 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -72,14 +72,15 @@ private[feature] trait ChiSqSelectorParams extends Params def getPercentile: Double = $(percentile) /** - * alpha means the highest p-value for features to be kept when select type is "fpr". - * alpha means the highest uncorrected p-value for features to be kept when select type + * Only applicable when selectorType = "fpr", "fdr", or "fwe" + * alpha means the highest p-value for features to be kept when select type is "fpr", + * or the highest uncorrected p-value for features to be kept when select type * is "fdr" and "fwe". * Default value is 0.05. */ final val alpha = new DoubleParam(this, "alpha", "alpha means the highest p-value for features to be kept when select type is fpr, " + - "alpha means the highest uncorrected p-value for features to be kept when select type " + + "or the highest uncorrected p-value for features to be kept when select type " + "is fdr and fwe.", ParamValidators.inRange(0, 1)) setDefault(alpha -> 0.05) @@ -104,13 +105,12 @@ private[feature] trait ChiSqSelectorParams extends Params /** * Chi-Squared feature selection, which selects categorical features to use for predicting a * categorical label. - * The selector supports three selection methods: `kbest`, `percentile` and `fpr`. + * The selector supports five selection methods: `kbest`, `percentile`, `fpr`, `fdr` and `fwe`. * `kbest` chooses the `k` top features according to a chi-squared test. * `percentile` is similar but chooses a fraction of all features instead of a fixed number. 
* `fpr` chooses all features whose false positive rate meets some threshold. - * `fpr` select features based on a false positive rate test. - * `fdr` select features based on an estimated false discovery rate. - * `fwe` select features based on family-wise error rate. + * `fdr` chooses all features whose false discovery rate meets some threshold. + * `fwe` chooses all features whose family-wise error rate meets some threshold. * By default, the selection method is `kbest`, the default number of top features is 50. */ @Since("1.6.0") @@ -134,9 +134,7 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str /** @group setParam */ @Since("2.1.0") - def setAlpha(value: Double): this.type = { - set(alpha, value) - } + def setAlpha(value: Double): this.type = set(alpha, value) /** @group setParam */ @Since("1.6.0") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 21fd42a6a15cb..64844e607b298 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -171,12 +171,12 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { /** * Creates a ChiSquared feature selector. - * The selector supports three selection methods: `kbest`, `percentile` and `fpr`. + * The selector supports five selection methods: `kbest`, `percentile`, `fpr`, `fdr` and `fwe`. * `kbest` chooses the `k` top features according to a chi-squared test. * `percentile` is similar but chooses a fraction of all features instead of a fixed number. - * `fpr` select features based on a false positive rate test. - * `fdr` select features based on an estimated false discovery rate. - * `fwe` select features based on family-wise error rate. + * `fpr` chooses all features whose false positive rate meets some threshold. 
+ * `fdr` chooses all features whose false discovery rate meets some threshold. + * `fwe` chooses all features whose family-wise error rate meets some threshold. * By default, the selection method is `kbest`, the default number of top features is 50. */ @Since("1.3.0") @@ -246,15 +246,16 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { chiSqTestResult .filter { case (res, _) => res.pValue < alpha } case ChiSqSelector.FDR => - val tempRDD = chiSqTestResult + // This uses the Benjamini-Hochberg procedure. + val tempRes = chiSqTestResult .sortBy { case (res, _) => res.pValue } - val maxIndex = tempRDD + val maxIndex = tempRes .zipWithIndex .filter { case ((res, _), index) => res.pValue <= alpha * (index + 1) / chiSqTestResult.length } - .map { case (_, index) => index} + .map { case (_, index) => index } .max - tempRDD.take(maxIndex + 1) + tempRes.take(maxIndex + 1) case ChiSqSelector.FWE => chiSqTestResult .filter { case (res, _) => res.pValue < alpha/chiSqTestResult.length } @@ -269,19 +270,29 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { @Since("2.1.0") object ChiSqSelector { - /** String name for `kbest` selector type. */ + /** String name for `kbest` selector type. + * Choose the `k` top features according to a chi-squared test. + */ private[spark] val KBest: String = "kbest" - /** String name for `percentile` selector type. */ + /** String name for `percentile` selector type. + * Choose a fraction of all features instead of a fixed number. + */ private[spark] val Percentile: String = "percentile" - /** String name for `fpr` selector type. */ + /** String name for `fpr` selector type. + * Choose all features whose false positive rate meets some threshold. + */ private[spark] val FPR: String = "fpr" - /** String name for `fdr` selector type. */ + /** String name for `fdr` selector type. + * Choose all features whose false discovery rate meets some threshold. 
+ */ private[spark] val FDR: String = "fdr" - /** String name for `fwe` selector type. */ + /** String name for `fwe` selector type. + * Choose all features whose family-wise error rate meets some threshold. + */ private[spark] val FWE: String = "fwe" /** Set of selector type and param pairs that ChiSqSelector supports. */ diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala index 3171549544e35..1ec1731bdd194 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala @@ -28,22 +28,85 @@ import org.apache.spark.sql.Row class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { - test("Test Chi-Square selector") { + /* + * Contingency tables + * feature0 = {6.0, 0.0, 8.0} + * class 0 1 2 + * 6.0||1|0|0| + * 0.0||0|3|0| + * 8.0||0|0|2| + * degree of freedom = 4, statistic = 12, pValue = 0.017 + * + * feature1 = {7.0, 9.0} + * class 0 1 2 + * 7.0||1|0|0| + * 9.0||0|3|2| + * degree of freedom = 2, statistic = 6, pValue = 0.049 + * + * feature2 = {0.0, 6.0, 3.0, 8.0} + * class 0 1 2 + * 0.0||1|0|0| + * 6.0||0|1|2| + * 3.0||0|1|0| + * 8.0||0|1|0| + * degree of freedom = 6, statistic = 8.66, pValue = 0.193 + * + * feature3 = {7.0, 0.0, 5.0, 4.0} + * class 0 1 2 + * 7.0||1|0|0| + * 0.0||0|2|0| + * 5.0||0|1|1| + * 4.0||0|0|1| + * degree of freedom = 6, statistic = 9.5, pValue = 0.147 + * + * feature4 = {6.0, 5.0, 4.0, 0.0} + * class 0 1 2 + * 6.0||1|1|0| + * 5.0||0|2|0| + * 4.0||0|0|1| + * 0.0||0|0|1| + * degree of freedom = 6, statistic = 8.0, pValue = 0.238 + * + * feature5 = {0.0, 9.0, 5.0, 4.0} + * class 0 1 2 + * 0.0||1|0|1| + * 9.0||0|1|0| + * 5.0||0|1|0| + * 4.0||0|1|1| + * degree of freedom = 6, statistic = 5, pValue = 0.54 + * + * Use chi-squared calculator from Internet + */ + + test("Test Chi-Square selector 
KBest") { import testImplicits._ val data = Seq( - LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), - LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), - LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))) + LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0))) ) val preFilteredData = Seq( - Vectors.dense(8.0), + Vectors.dense(6.0), + Vectors.dense(0.0), Vectors.dense(0.0), Vectors.dense(0.0), + Vectors.dense(8.0), Vectors.dense(8.0) ) + val preFilteredData2 = Seq( + Vectors.dense(6.0, 7.0, 7.0), + Vectors.dense(0.0, 9.0, 0.0), + Vectors.dense(0.0, 9.0, 0.0), + Vectors.dense(0.0, 9.0, 5.0), + Vectors.dense(8.0, 9.0, 5.0), + Vectors.dense(8.0, 9.0, 4.0) + ) + val df = sc.parallelize(data.zip(preFilteredData)) .map(x => (x._1.label, x._1.features, x._2)) .toDF("label", "data", "preFilteredData") @@ -60,37 +123,236 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext assert(vec1 ~== vec2 absTol 1e-1) } - selector.setSelectorType("percentile").setPercentile(0.34).fit(df).transform(df) - .select("filtered", "preFilteredData").collect().foreach { - case Row(vec1: Vector, vec2: Vector) => - assert(vec1 ~== vec2 absTol 1e-1) - } + val df2 = sc.parallelize(data.zip(preFilteredData2)) + .map(x => (x._1.label, x._1.features, x._2)) + .toDF("label", "data", "preFilteredData") + + selector.setNumTopFeatures(3).fit(df2).transform(df2).select("filtered", "preFilteredData") + .collect().foreach { + case Row(vec1: Vector, vec2: Vector) => + assert(vec1 
~== vec2 absTol 1e-1) + } + } + + test("Test Chi-Square selector Percentile") { + import testImplicits._ + val data = Seq( + LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0))) + ) + + val preFilteredData = Seq( + Vectors.dense(6.0), + Vectors.dense(0.0), + Vectors.dense(0.0), + Vectors.dense(0.0), + Vectors.dense(8.0), + Vectors.dense(8.0) + ) val preFilteredData2 = Seq( - Vectors.dense(8.0, 7.0), + Vectors.dense(6.0, 7.0, 7.0), + Vectors.dense(0.0, 9.0, 0.0), + Vectors.dense(0.0, 9.0, 0.0), + Vectors.dense(0.0, 9.0, 5.0), + Vectors.dense(8.0, 9.0, 5.0), + Vectors.dense(8.0, 9.0, 4.0) + ) + + val df = sc.parallelize(data.zip(preFilteredData)) + .map(x => (x._1.label, x._1.features, x._2)) + .toDF("label", "data", "preFilteredData") + + val selector = new ChiSqSelector() + .setSelectorType("percentile") + .setPercentile(0.2) + .setFeaturesCol("data") + .setLabelCol("label") + .setOutputCol("filtered") + + selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { + case Row(vec1: Vector, vec2: Vector) => + assert(vec1 ~== vec2 absTol 1e-1) + } + + val df2 = sc.parallelize(data.zip(preFilteredData2)) + .map(x => (x._1.label, x._1.features, x._2)) + .toDF("label", "data", "preFilteredData") + + selector.setPercentile(0.5).fit(df2).transform(df2).select("filtered", "preFilteredData") + .collect().foreach { + case Row(vec1: Vector, vec2: Vector) => + assert(vec1 ~== vec2 absTol 1e-1) + } + } + + test("Test Chi-Square selector FPR") { + import testImplicits._ + val data = Seq( + LabeledPoint(0.0, Vectors.sparse(6, Array((0, 
6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0))) + ) + + val preFilteredData = Seq( + Vectors.dense(6.0), + Vectors.dense(0.0), + Vectors.dense(0.0), + Vectors.dense(0.0), + Vectors.dense(8.0), + Vectors.dense(8.0) + ) + + val preFilteredData2 = Seq( + Vectors.dense(6.0, 7.0, 7.0), + Vectors.dense(0.0, 9.0, 0.0), + Vectors.dense(0.0, 9.0, 0.0), + Vectors.dense(0.0, 9.0, 5.0), + Vectors.dense(8.0, 9.0, 5.0), + Vectors.dense(8.0, 9.0, 4.0) + ) + + val df = sc.parallelize(data.zip(preFilteredData)) + .map(x => (x._1.label, x._1.features, x._2)) + .toDF("label", "data", "preFilteredData") + + val selector = new ChiSqSelector() + .setSelectorType("fpr") + .setAlpha(0.02) + .setFeaturesCol("data") + .setLabelCol("label") + .setOutputCol("filtered") + + selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { + case Row(vec1: Vector, vec2: Vector) => + assert(vec1 ~== vec2 absTol 1e-1) + } + + val df2 = sc.parallelize(data.zip(preFilteredData2)) + .map(x => (x._1.label, x._1.features, x._2)) + .toDF("label", "data", "preFilteredData") + + selector.setAlpha(0.15).fit(df2).transform(df2).select("filtered", "preFilteredData") + .collect().foreach { + case Row(vec1: Vector, vec2: Vector) => + assert(vec1 ~== vec2 absTol 1e-1) + } + } + + test("Test Chi-Square selector FDR") { + import testImplicits._ + val data = Seq( + LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 
5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0))) + ) + + val preFilteredData = Seq( + Vectors.dense(6.0), + Vectors.dense(0.0), + Vectors.dense(0.0), + Vectors.dense(0.0), + Vectors.dense(8.0), + Vectors.dense(8.0) + ) + + val preFilteredData2 = Seq( + Vectors.dense(6.0, 7.0), Vectors.dense(0.0, 9.0), Vectors.dense(0.0, 9.0), + Vectors.dense(0.0, 9.0), + Vectors.dense(8.0, 9.0), Vectors.dense(8.0, 9.0) ) + val df = sc.parallelize(data.zip(preFilteredData)) + .map(x => (x._1.label, x._1.features, x._2)) + .toDF("label", "data", "preFilteredData") + + val selector = new ChiSqSelector() + .setSelectorType("fdr") + .setAlpha(0.12) + .setFeaturesCol("data") + .setLabelCol("label") + .setOutputCol("filtered") + + selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { + case Row(vec1: Vector, vec2: Vector) => + assert(vec1 ~== vec2 absTol 1e-1) + } + val df2 = sc.parallelize(data.zip(preFilteredData2)) .map(x => (x._1.label, x._1.features, x._2)) .toDF("label", "data", "preFilteredData") - selector.setSelectorType("fpr").setAlpha(0.2).fit(df2).transform(df2) - .select("filtered", "preFilteredData").collect().foreach { - case Row(vec1: Vector, vec2: Vector) => - assert(vec1 ~== vec2 absTol 1e-1) - } + selector.setAlpha(0.15).fit(df2).transform(df2).select("filtered", "preFilteredData") + .collect().foreach { + case Row(vec1: Vector, vec2: Vector) => + assert(vec1 ~== vec2 absTol 1e-1) + } + } + + test("Test Chi-Square selector FWE") { + import testImplicits._ + val data = Seq( + LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, 
Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0))) + ) - selector.setSelectorType("fwe").setAlpha(0.5).fit(df2).transform(df2) - .select("filtered", "preFilteredData").collect().foreach { + val preFilteredData = Seq( + Vectors.dense(6.0), + Vectors.dense(0.0), + Vectors.dense(0.0), + Vectors.dense(0.0), + Vectors.dense(8.0), + Vectors.dense(8.0) + ) + + val preFilteredData2 = Seq( + Vectors.dense(6.0, 7.0), + Vectors.dense(0.0, 9.0), + Vectors.dense(0.0, 9.0), + Vectors.dense(0.0, 9.0), + Vectors.dense(8.0, 9.0), + Vectors.dense(8.0, 9.0) + ) + + val df = sc.parallelize(data.zip(preFilteredData)) + .map(x => (x._1.label, x._1.features, x._2)) + .toDF("label", "data", "preFilteredData") + + val selector = new ChiSqSelector() + .setSelectorType("fwe") + .setAlpha(0.12) + .setFeaturesCol("data") + .setLabelCol("label") + .setOutputCol("filtered") + + selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } - selector.setSelectorType("fdr").setAlpha(0.21).fit(df2).transform(df2) - .select("filtered", "preFilteredData").collect().foreach { + val df2 = sc.parallelize(data.zip(preFilteredData2)) + .map(x => (x._1.label, x._1.features, x._2)) + .toDF("label", "data", "preFilteredData") + + selector.setAlpha(0.3).fit(df2).transform(df2).select("filtered", "preFilteredData") + .collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala index ac702b4b7c69e..ca5ba9b5871af 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala +++ 
b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala @@ -27,60 +27,240 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { /* * Contingency tables - * feature0 = {8.0, 0.0} + * feature0 = {6.0, 0.0, 8.0} * class 0 1 2 - * 8.0||1|0|1| - * 0.0||0|2|0| + * 6.0||1|0|0| + * 0.0||0|3|0| + * 8.0||0|0|2| + * degree of freedom = 4, statistic = 12, pValue = 0.017 * * feature1 = {7.0, 9.0} * class 0 1 2 * 7.0||1|0|0| - * 9.0||0|2|1| + * 9.0||0|3|2| + * degree of freedom = 2, statistic = 6, pValue = 0.049 * - * feature2 = {0.0, 6.0, 8.0, 5.0} + * feature2 = {0.0, 6.0, 3.0, 8.0} * class 0 1 2 * 0.0||1|0|0| - * 6.0||0|1|0| + * 6.0||0|1|2| + * 3.0||0|1|0| * 8.0||0|1|0| - * 5.0||0|0|1| + * degree of freedom = 6, statistic = 8.66, pValue = 0.193 + * + * feature3 = {7.0, 0.0, 5.0, 4.0} + * class 0 1 2 + * 7.0||1|0|0| + * 0.0||0|2|0| + * 5.0||0|1|1| + * 4.0||0|0|1| + * degree of freedom = 6, statistic = 9.5, pValue = 0.147 + * + * feature4 = {6.0, 5.0, 4.0, 0.0} + * class 0 1 2 + * 6.0||1|1|0| + * 5.0||0|2|0| + * 4.0||0|0|1| + * 0.0||0|0|1| + * degree of freedom = 6, statistic = 8.0, pValue = 0.238 + * + * feature5 = {0.0, 9.0, 5.0, 4.0} + * class 0 1 2 + * 0.0||1|0|1| + * 9.0||0|1|0| + * 5.0||0|1|0| + * 4.0||0|1|1| + * degree of freedom = 6, statistic = 5, pValue = 0.54 * * Use chi-squared calculator from Internet */ - test("ChiSqSelector transform test (sparse & dense vector)") { + test("ChiSqSelector transform KBest test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( - Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), - LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), - LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2) + Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + 
LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) val preFilteredData = - Set(LabeledPoint(0.0, Vectors.dense(Array(8.0))), + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0)))) val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) + + val preFilteredData2 = + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0, 7.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 5.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 4.0)))) + + val model2 = new ChiSqSelector(3).fit(labeledDiscreteData) + val filteredData2 = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model2.transform(lp.features)) + }.collect().toSet + assert(filteredData2 == preFilteredData2) } - test("ChiSqSelector by FPR transform test (sparse & dense vector)") { + test("ChiSqSelector transform Percentile test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( - Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))), - LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))), - LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2) + Seq(LabeledPoint(0.0, 
Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) val preFilteredData = - Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(4.0))), - LabeledPoint(1.0, Vectors.dense(Array(4.0))), - LabeledPoint(2.0, Vectors.dense(Array(9.0)))) - val model = new ChiSqSelector().setSelectorType("fpr").setAlpha(0.1).fit(labeledDiscreteData) + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0)))) + val model = new ChiSqSelector().setSelectorType("percentile").setPercentile(0.2) + .fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) + + val preFilteredData2 = + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0, 7.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 5.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 4.0)))) + + val model2 = new ChiSqSelector().setSelectorType("percentile").setPercentile(0.5) + .fit(labeledDiscreteData) + val filteredData2 = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model2.transform(lp.features)) + }.collect().toSet + assert(filteredData2 == 
preFilteredData2) + } + + test("ChiSqSelector transform FPR test (sparse & dense vector)") { + val labeledDiscreteData = sc.parallelize( + Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) + val preFilteredData = + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0)))) + val model = new ChiSqSelector().setSelectorType("fpr").setAlpha(0.02) + .fit(labeledDiscreteData) + val filteredData = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model.transform(lp.features)) + }.collect().toSet + assert(filteredData == preFilteredData) + + val preFilteredData2 = + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0, 7.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 5.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 4.0)))) + + val model2 = new ChiSqSelector().setSelectorType("fpr").setAlpha(0.15) + .fit(labeledDiscreteData) + val filteredData2 = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model2.transform(lp.features)) + }.collect().toSet + assert(filteredData2 == preFilteredData2) + } + + test("ChiSqSelector transform FDR test (sparse & dense vector)") { + val labeledDiscreteData = sc.parallelize( + 
Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) + val preFilteredData = + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0)))) + val model = new ChiSqSelector().setSelectorType("fdr").setAlpha(0.12) + .fit(labeledDiscreteData) + val filteredData = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model.transform(lp.features)) + }.collect().toSet + assert(filteredData == preFilteredData) + + val preFilteredData2 = + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0)))) + + val model2 = new ChiSqSelector().setSelectorType("fdr").setAlpha(0.15) + .fit(labeledDiscreteData) + val filteredData2 = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model2.transform(lp.features)) + }.collect().toSet + assert(filteredData2 == preFilteredData2) + } + + test("ChiSqSelector transform FWE test (sparse & dense vector)") { + val labeledDiscreteData = sc.parallelize( + Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + 
LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) + val preFilteredData = + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0)))) + val model = new ChiSqSelector().setSelectorType("fwe").setAlpha(0.15) + .fit(labeledDiscreteData) + val filteredData = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model.transform(lp.features)) + }.collect().toSet + assert(filteredData == preFilteredData) + + val preFilteredData2 = + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0)))) + + val model2 = new ChiSqSelector().setSelectorType("fwe").setAlpha(0.3) + .fit(labeledDiscreteData) + val filteredData2 = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model2.transform(lp.features)) + }.collect().toSet + assert(filteredData2 == preFilteredData2) } test("model load / save") { diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 48a77e585c8cb..d9d9267e2397e 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2580,8 +2580,15 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja """ .. note:: Experimental - Chi-Squared feature selection, which selects categorical features to use for predicting a - categorical label. 
+ Creates a ChiSquared feature selector. + The selector supports five selection methods: `KBest`, `Percentile`, `FPR`, `FDR` and `FWE`. + `kbest` chooses the `k` top features according to a chi-squared test. + `percentile` is similar but chooses a fraction of all features instead of a fixed number. + `fpr` chooses all features whose false positive rate meets some threshold. + `fdr` chooses all features whose false discovery rate meets some threshold. + `fwe` chooses all features whose family-wise error rate meets some threshold. + By default, the selection method is `kbest`, the default number of top features is 50. + >>> from pyspark.ml.linalg import Vectors >>> df = spark.createDataFrame( @@ -2625,7 +2632,7 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja typeConverter=TypeConverters.toFloat) alpha = Param(Params._dummy(), "alpha", "alpha means the highest p-value for features " + - "to be kept when select type is fpr, alpha means the highest uncorrected " + + "to be kept when select type is fpr, or the highest uncorrected " + "p-value for features to be kept when select type is fdr and fwe.", typeConverter=TypeConverters.toFloat) @@ -2701,6 +2708,7 @@ def getPercentile(self): @since("2.1.0") def setAlpha(self, value): """ + Only applicable when selectorType = "fpr", "fdr" or "fwe" Sets the value of :py:attr:`alpha`. """ return self._set(alpha=value) diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 1a9d4797bfc81..eda4e93c70fbf 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -274,12 +274,12 @@ def transform(self, vector): class ChiSqSelector(object): """ Creates a ChiSquared feature selector. - The selector supports three selection methods: `KBest`, `Percentile` and `FPR`. + The selector supports five selection methods: `KBest`, `Percentile`, `FPR`, `FDR` and `FWE`. `kbest` chooses the `k` top features according to a chi-squared test.
`percentile` is similar but chooses a fraction of all features instead of a fixed number. - `fpr` select features based on a false positive rate test. - `fdr` select features based on an estimated false discovery rate. - `fwe` select features based on family-wise error rate. + `fpr` chooses all features whose false positive rate meets some threshold. + `fdr` chooses all features whose false discovery rate meets some threshold. + `fwe` chooses all features whose family-wise error rate meets some threshold. By default, the selection method is `kbest`, the default number of top features is 50. >>> data = [ From 92530ab7562f5d4968b5c6130ab611fd2b629549 Mon Sep 17 00:00:00 2001 From: Peng Date: Thu, 20 Oct 2016 17:45:26 +0800 Subject: [PATCH 6/9] minor change --- .../scala/org/apache/spark/mllib/feature/ChiSqSelector.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 64844e607b298..a41558cd72d14 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -258,7 +258,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { tempRes.take(maxIndex + 1) case ChiSqSelector.FWE => chiSqTestResult - .filter { case (res, _) => res.pValue < alpha/chiSqTestResult.length } + .filter { case (res, _) => res.pValue < alpha / chiSqTestResult.length } case errorType => throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType") } From 5a7cc2ca9e81ade4d430411ab6e314ae5010169f Mon Sep 17 00:00:00 2001 From: Peng Date: Fri, 23 Dec 2016 12:46:23 +0800 Subject: [PATCH 7/9] doc and Since fix --- docs/ml-features.md | 4 +- docs/mllib-feature-extraction.md | 4 +- .../spark/ml/feature/ChiSqSelector.scala | 23 +-- .../spark/mllib/feature/ChiSqSelector.scala | 16 +-
.../mllib/feature/ChiSqSelectorSuite.scala | 152 ++++-------------- python/pyspark/ml/feature.py | 19 ++- python/pyspark/mllib/feature.py | 11 +- 7 files changed, 73 insertions(+), 156 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index 8dd14dfc3383d..4db2907d59ec1 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1427,8 +1427,8 @@ features to choose. It supports five selection methods: `numTopFeatures`, `perce * `numTopFeatures` chooses a fixed number of top features according to a chi-squared test. This is akin to yielding the features with the most predictive power. * `percentile` is similar to `numTopFeatures` but chooses a fraction of all features instead of a fixed number. * `fpr` chooses all features whose p-value is below a threshold, thus controlling the false positive rate of selection. -* `fdr` chooses all features whose false discovery rate meets some threshold. -* `fwe` chooses all features whose family-wise error rate meets some threshold. +* `fdr` uses the [Benjamini-Hochberg procedure](https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) to choose all features whose false discovery rate is below a threshold. +* `fwe` chooses all features whose p-value is below a threshold, thus controlling the family-wise error rate of selection. By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. The user can choose a selection method using `setSelectorType`. diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md index dd79dc841fbe3..c67cfbe8e607b 100644 --- a/docs/mllib-feature-extraction.md +++ b/docs/mllib-feature-extraction.md @@ -232,8 +232,8 @@ features to choose. It supports five selection methods: `numTopFeatures`, `perce * `numTopFeatures` chooses a fixed number of top features according to a chi-squared test. This is akin to yielding the features with the most predictive power.
* `percentile` is similar to `numTopFeatures` but chooses a fraction of all features instead of a fixed number. * `fpr` chooses all features whose p-value is below a threshold, thus controlling the false positive rate of selection. -* `fdr` chooses all features whose false discovery rate meets some threshold. -* `fwe` chooses all features whose family-wise error rate meets some threshold. +* `fdr` uses the [Benjamini-Hochberg procedure](https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) to choose all features whose false discovery rate is below a threshold. +* `fwe` chooses all features whose p-value is below a threshold, thus controlling the family-wise error rate of selection. By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. The user can choose a selection method using `setSelectorType`. diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 1999c96a03086..abfae3f75d753 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -92,28 +92,28 @@ private[feature] trait ChiSqSelectorParams extends Params def getFpr: Double = $(fpr) /** - * The highest uncorrected p-value for features to be kept. + * The upper bound of the expected false discovery rate. * Only applicable when selectorType = "fdr". * Default value is 0.05. * @group param */ - @Since("2.1.0") + @Since("2.2.0") final val fdr = new DoubleParam(this, "fdr", - "The highest uncorrected p-value for features to be kept.", ParamValidators.inRange(0, 1)) + "The upper bound of the expected false discovery rate.", ParamValidators.inRange(0, 1)) setDefault(fdr -> 0.05) /** @group getParam */ def getFdr: Double = $(fdr) /** - * The highest uncorrected p-value for features to be kept.
+ * The upper bound of the expected family-wise error rate. * Only applicable when selectorType = "fwe". * Default value is 0.05. * @group param */ - @Since("2.1.0") + @Since("2.2.0") final val fwe = new DoubleParam(this, "fwe", - "The highest uncorrected p-value for features to be kept.", ParamValidators.inRange(0, 1)) + "The upper bound of the expected family-wise error rate.", ParamValidators.inRange(0, 1)) setDefault(fwe -> 0.05) /** @group getParam */ @@ -145,8 +145,11 @@ private[feature] trait ChiSqSelectorParams extends Params * - `percentile` is similar but chooses a fraction of all features instead of a fixed number. * - `fpr` chooses all features whose p-value is below a threshold, thus controlling the false * positive rate of selection. - * - `fdr` chooses all features whose false discovery rate meets some threshold. - * - `fwe` chooses all features whose family-wise error rate meets some threshold. + * - `fdr` uses the [Benjamini-Hochberg procedure] + * (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) + * to choose all features whose false discovery rate is below a threshold. + * - `fwe` chooses all features whose p-value is below a threshold, + * thus controlling the family-wise error rate of selection. * By default, the selection method is `numTopFeatures`, with the default number of top features * set to 50.
*/ @@ -170,11 +173,11 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str def setFpr(value: Double): this.type = set(fpr, value) /** @group setParam */ - @Since("2.1.0") + @Since("2.2.0") def setFdr(value: Double): this.type = set(fdr, value) /** @group setParam */ - @Since("2.1.0") + @Since("2.2.0") def setFwe(value: Double): this.type = set(fwe, value) /** @group setParam */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 24ae113b5d61f..934c3b666a3b4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -177,8 +177,11 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { * - `percentile` is similar but chooses a fraction of all features instead of a fixed number. * - `fpr` chooses all features whose p-value is below a threshold, thus controlling the false * positive rate of selection. - * - `fdr` chooses all features whose false discovery rate meets some threshold. - * - `fwe` chooses all features whose family-wise error rate meets some threshold. + * - `fdr` uses the [Benjamini-Hochberg procedure] + * (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) + * to choose all features whose false discovery rate is below a threshold. + * - `fwe` chooses all features whose p-value is below a threshold, + * thus controlling the family-wise error rate of selection. * By default, the selection method is `numTopFeatures`, with the default number of top features * set to 50.
*/ @@ -220,14 +223,14 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { this } - @Since("2.1.0") + @Since("2.2.0") def setFdr(value: Double): this.type = { require(0.0 <= value && value <= 1.0, "FDR must be in [0,1]") fdr = value this } - @Since("2.1.0") + @Since("2.2.0") def setFwe(value: Double): this.type = { require(0.0 <= value && value <= 1.0, "FWE must be in [0,1]") fwe = value @@ -266,6 +269,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { .filter { case (res, _) => res.pValue < fpr } case ChiSqSelector.FDR => // This uses the Benjamini-Hochberg procedure. + // https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure val tempRes = chiSqTestResult .sortBy { case (res, _) => res.pValue } val maxIndex = tempRes @@ -289,10 +293,10 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { private[spark] object ChiSqSelector { /** String name for `numTopFeatures` selector type. */ - val NumTopFeatures: String = "numTopFeatures" + private[spark] val NumTopFeatures: String = "numTopFeatures" /** String name for `percentile` selector type. */ - val Percentile: String = "percentile" + private[spark] val Percentile: String = "percentile" /** String name for `fpr` selector type. 
*/ private[spark] val FPR: String = "fpr" diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala index b267cf35220b8..6578963ad8895 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala @@ -75,28 +75,16 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { * Use chi-squared calculator from Internet */ + lazy val labeledDiscreteData = sc.parallelize( + Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) + test("ChiSqSelector transform by KBest test (sparse & dense vector)") { - val labeledDiscreteData = sc.parallelize( - Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), - LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) val preFilteredData = - Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0))), - LabeledPoint(2.0, 
Vectors.dense(Array(8.0)))) - val model = new ChiSqSelector(1).fit(labeledDiscreteData) - val filteredData = labeledDiscreteData.map { lp => - LabeledPoint(lp.label, model.transform(lp.features)) - }.collect().toSet - assert(filteredData == preFilteredData) - - val preFilteredData2 = Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0, 7.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), @@ -104,36 +92,15 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 4.0)))) - val model2 = new ChiSqSelector(3).fit(labeledDiscreteData) - val filteredData2 = labeledDiscreteData.map { lp => - LabeledPoint(lp.label, model2.transform(lp.features)) + val model = new ChiSqSelector(3).fit(labeledDiscreteData) + val filteredData = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet - assert(filteredData2 === preFilteredData2) + assert(filteredData === preFilteredData) } test("ChiSqSelector transform by Percentile test (sparse & dense vector)") { - val labeledDiscreteData = sc.parallelize( - Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), - LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) val preFilteredData = - Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0))), - 
LabeledPoint(2.0, Vectors.dense(Array(8.0)))) - val model = new ChiSqSelector().setSelectorType("percentile").setPercentile(0.2) - .fit(labeledDiscreteData) - val filteredData = labeledDiscreteData.map { lp => - LabeledPoint(lp.label, model.transform(lp.features)) - }.collect().toSet - assert(filteredData == preFilteredData) - - val preFilteredData2 = Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0, 7.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), @@ -141,37 +108,16 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 4.0)))) - val model2 = new ChiSqSelector().setSelectorType("percentile").setPercentile(0.5) + val model = new ChiSqSelector().setSelectorType("percentile").setPercentile(0.5) .fit(labeledDiscreteData) - val filteredData2 = labeledDiscreteData.map { lp => - LabeledPoint(lp.label, model2.transform(lp.features)) + val filteredData = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet - assert(filteredData2 == preFilteredData2) + assert(filteredData == preFilteredData) } test("ChiSqSelector transform by FPR test (sparse & dense vector)") { - val labeledDiscreteData = sc.parallelize( - Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), - LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) val preFilteredData = - Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, 
Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0)))) - val model = new ChiSqSelector().setSelectorType("fpr").setFpr(0.02) - .fit(labeledDiscreteData) - val filteredData = labeledDiscreteData.map { lp => - LabeledPoint(lp.label, model.transform(lp.features)) - }.collect().toSet - assert(filteredData === preFilteredData) - - val preFilteredData2 = Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0, 7.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), @@ -179,37 +125,16 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 4.0)))) - val model2 = new ChiSqSelector().setSelectorType("fpr").setFpr(0.15) - .fit(labeledDiscreteData) - val filteredData2 = labeledDiscreteData.map { lp => - LabeledPoint(lp.label, model2.transform(lp.features)) - }.collect().toSet - assert(filteredData2 === preFilteredData2) - } - - test("ChiSqSelector transform by FDR test (sparse & dense vector)") { - val labeledDiscreteData = sc.parallelize( - Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), - LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) - val preFilteredData = - Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(2.0, 
Vectors.dense(Array(8.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0)))) - val model = new ChiSqSelector().setSelectorType("fdr").setFdr(0.12) + val model = new ChiSqSelector().setSelectorType("fpr").setFpr(0.15) .fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData === preFilteredData) + } - val preFilteredData2 = + test("ChiSqSelector transform by FDR test (sparse & dense vector)") { + val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), @@ -217,37 +142,16 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0)))) - val model2 = new ChiSqSelector().setSelectorType("fdr").setFdr(0.15) - .fit(labeledDiscreteData) - val filteredData2 = labeledDiscreteData.map { lp => - LabeledPoint(lp.label, model2.transform(lp.features)) - }.collect().toSet - assert(filteredData2 === preFilteredData2) - } - - test("ChiSqSelector transform by FWE test (sparse & dense vector)") { - val labeledDiscreteData = sc.parallelize( - Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), - LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) - val preFilteredData = - Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - 
LabeledPoint(2.0, Vectors.dense(Array(8.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0)))) - val model = new ChiSqSelector().setSelectorType("fwe").setFwe(0.15) + val model = new ChiSqSelector().setSelectorType("fdr").setFdr(0.15) .fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData === preFilteredData) + } - val preFilteredData2 = + test("ChiSqSelector transform by FWE test (sparse & dense vector)") { + val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), @@ -255,12 +159,12 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0)))) - val model2 = new ChiSqSelector().setSelectorType("fwe").setFwe(0.3) + val model = new ChiSqSelector().setSelectorType("fwe").setFwe(0.3) .fit(labeledDiscreteData) - val filteredData2 = labeledDiscreteData.map { lp => - LabeledPoint(lp.label, model2.transform(lp.features)) + val filteredData = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet - assert(filteredData2 === preFilteredData2) + assert(filteredData === preFilteredData) } test("model load / save") { diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 014585e992b47..8488ffd4987fc 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2636,8 +2636,11 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja `percentile` is similar but chooses a fraction of all features instead of a fixed number. `fpr` chooses all features whose p-value is below a threshold, thus controlling the false positive rate of selection. - `fdr` chooses all features whose false discovery rate meets some threshold. 
- `fwe` chooses all features whose family-wise error rate meets some threshold. + `fdr` uses the [Benjamini-Hochberg procedure] + (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) + to choose all features whose false discovery rate is below a threshold. + `fwe` chooses all features whose whose p-values is below a threshold, + thus controlling the family-wise error rate of selection. By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. @@ -2686,10 +2689,10 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja fpr = Param(Params._dummy(), "fpr", "The highest p-value for features to be kept.", typeConverter=TypeConverters.toFloat) - fdr = Param(Params._dummy(), "fdr", "The highest uncorrected p-value for features to be kept.", + fdr = Param(Params._dummy(), "fdr", "The upper bound of the expected false discovery rate.", typeConverter=TypeConverters.toFloat) - fwe = Param(Params._dummy(), "fwe", "The highest uncorrected p-value for features to be kept.", + fwe = Param(Params._dummy(), "fwe", "The upper bound of the expected family-wise error rate.", typeConverter=TypeConverters.toFloat) @keyword_only @@ -2781,7 +2784,7 @@ def getFpr(self): """ return self.getOrDefault(self.fpr) - @since("2.1.0") + @since("2.2.0") def setFdr(self, value): """ Sets the value of :py:attr:`fdr`. @@ -2789,14 +2792,14 @@ def setFdr(self, value): """ return self._set(fdr=value) - @since("2.1.0") + @since("2.2.0") def getFdr(self): """ Gets the value of fdr or its default value. """ return self.getOrDefault(self.fdr) - @since("2.1.0") + @since("2.2.0") def setFwe(self, value): """ Sets the value of :py:attr:`fwe`. @@ -2804,7 +2807,7 @@ def setFwe(self, value): """ return self._set(fwe=value) - @since("2.1.0") + @since("2.2.0") def getFwe(self): """ Gets the value of fwe or its default value. 
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 0699c0d329c42..21f3a458380a9 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -280,8 +280,11 @@ class ChiSqSelector(object): `percentile` is similar but chooses a fraction of all features instead of a fixed number. `fpr` chooses all features whose p-value is below a threshold, thus controlling the false positive rate of selection. - `fdr` chooses all features whose false discovery rate meets some threshold. - `fwe` chooses all features whose family-wise error rate meets some threshold. + `fdr` uses the [Benjamini-Hochberg procedure] + (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) + to choose all features whose false discovery rate is below a threshold. + `fwe` chooses all features whose whose p-values is below a threshold, + thus controlling the family-wise error rate of selection. By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. @@ -344,7 +347,7 @@ def setFpr(self, fpr): self.fpr = float(fpr) return self - @since('2.1.0') + @since('2.2.0') def setFdr(self, fdr): """ set FDR [0.0, 1.0] for feature selection by FDR. @@ -353,7 +356,7 @@ def setFdr(self, fdr): self.fdr = float(fdr) return self - @since('2.1.0') + @since('2.2.0') def setFwe(self, fwe): """ set FWE [0.0, 1.0] for feature selection by FWE. 
From aa5f2cc67a1b523902c52330179fa227159728a5 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Fri, 23 Dec 2016 23:43:09 +0800 Subject: [PATCH 8/9] fix typo --- docs/ml-features.md | 2 +- docs/mllib-feature-extraction.md | 2 +- .../main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala | 2 +- .../scala/org/apache/spark/mllib/feature/ChiSqSelector.scala | 2 +- .../org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala | 2 +- python/pyspark/ml/feature.py | 2 +- python/pyspark/mllib/feature.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index 4db2907d59ec1..1d3449746c9be 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1428,7 +1428,7 @@ features to choose. It supports five selection methods: `numTopFeatures`, `perce * `percentile` is similar to `numTopFeatures` but chooses a fraction of all features instead of a fixed number. * `fpr` chooses all features whose p-value is below a threshold, thus controlling the false positive rate of selection. * `fdr` uses the [Benjamini-Hochberg procedure](https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) to choose all features whose false discovery rate is below a threshold. -* `fwe` chooses all features whose whose p-values is below a threshold, thus controlling the family-wise error rate of selection. +* `fwe` chooses all features whose p-values is below a threshold, thus controlling the family-wise error rate of selection. By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. The user can choose a selection method using `setSelectorType`. diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md index c67cfbe8e607b..acd28943132db 100644 --- a/docs/mllib-feature-extraction.md +++ b/docs/mllib-feature-extraction.md @@ -233,7 +233,7 @@ features to choose. 
It supports five selection methods: `numTopFeatures`, `perce * `percentile` is similar to `numTopFeatures` but chooses a fraction of all features instead of a fixed number. * `fpr` chooses all features whose p-value is below a threshold, thus controlling the false positive rate of selection. * `fdr` uses the [Benjamini-Hochberg procedure](https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) to choose all features whose false discovery rate is below a threshold. -* `fwe` chooses all features whose whose p-values is below a threshold, thus controlling the family-wise error rate of selection. +* `fwe` chooses all features whose p-values is below a threshold, thus controlling the family-wise error rate of selection. By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. The user can choose a selection method using `setSelectorType`. diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index abfae3f75d753..353bd186daf01 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -148,7 +148,7 @@ private[feature] trait ChiSqSelectorParams extends Params * - `fdr` uses the [Benjamini-Hochberg procedure] * (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) * to choose all features whose false discovery rate is below a threshold. - * - `fwe` chooses all features whose whose p-values is below a threshold, + * - `fwe` chooses all features whose p-values is below a threshold, * thus controlling the family-wise error rate of selection. * By default, the selection method is `numTopFeatures`, with the default number of top features * set to 50. 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 934c3b666a3b4..9dea3c3e843c4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -180,7 +180,7 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { * - `fdr` uses the [Benjamini-Hochberg procedure] * (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) * to choose all features whose false discovery rate is below a threshold. - * - `fwe` chooses all features whose whose p-values is below a threshold, + * - `fwe` chooses all features whose p-values is below a threshold, * thus controlling the family-wise error rate of selection. * By default, the selection method is `numTopFeatures`, with the default number of top features * set to 50. diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala index 6578963ad8895..b585e09c69f44 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala @@ -83,7 +83,7 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) - test("ChiSqSelector transform by KBest test (sparse & dense vector)") { + test("ChiSqSelector transform by numTopFeatures test (sparse & dense vector)") { val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0, 7.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 8488ffd4987fc..3c1d57daec859 100755 
--- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2639,7 +2639,7 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja `fdr` uses the [Benjamini-Hochberg procedure] (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) to choose all features whose false discovery rate is below a threshold. - `fwe` chooses all features whose whose p-values is below a threshold, + `fwe` chooses all features whose p-values is below a threshold, thus controlling the family-wise error rate of selection. By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 21f3a458380a9..55a917f1d609a 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -283,7 +283,7 @@ class ChiSqSelector(object): `fdr` uses the [Benjamini-Hochberg procedure] (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) to choose all features whose false discovery rate is below a threshold. - `fwe` chooses all features whose whose p-values is below a threshold, + `fwe` chooses all features whose p-values is below a threshold, thus controlling the family-wise error rate of selection. By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. 
From da6ac358ee4861f0d0cd320f46482b2e2535322a Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Tue, 27 Dec 2016 21:22:11 +0800 Subject: [PATCH 9/9] python code style change --- .../mllib/feature/ChiSqSelectorSuite.scala | 2 +- python/pyspark/ml/feature.py | 25 ++++++++++++------- python/pyspark/mllib/feature.py | 25 ++++++++++++------- 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala index b585e09c69f44..305cb4cbbdeea 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala @@ -113,7 +113,7 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet - assert(filteredData == preFilteredData) + assert(filteredData === preFilteredData) } test("ChiSqSelector transform by FPR test (sparse & dense vector)") { diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 3c1d57daec859..dbd17e01d2213 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2632,15 +2632,22 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja Creates a ChiSquared feature selector. The selector supports different selection methods: `numTopFeatures`, `percentile`, `fpr`, `fdr`, `fwe`. - `numTopFeatures` chooses a fixed number of top features according to a chi-squared test. - `percentile` is similar but chooses a fraction of all features instead of a fixed number. - `fpr` chooses all features whose p-value is below a threshold, thus controlling the false - positive rate of selection. 
- `fdr` uses the [Benjamini-Hochberg procedure] - (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) - to choose all features whose false discovery rate is below a threshold. - `fwe` chooses all features whose p-values is below a threshold, - thus controlling the family-wise error rate of selection. + + * `numTopFeatures` chooses a fixed number of top features according to a chi-squared test. + + * `percentile` is similar but chooses a fraction of all features + instead of a fixed number. + + * `fpr` chooses all features whose p-value is below a threshold, + thus controlling the false positive rate of selection. + + * `fdr` uses the `Benjamini-Hochberg procedure <https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure>`_ + to choose all features whose false discovery rate is below a threshold. + + * `fwe` chooses all features whose p-values are below a threshold, + thus controlling the family-wise error rate of selection. + By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 55a917f1d609a..61f2bc7492ad6 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -276,15 +276,22 @@ class ChiSqSelector(object): Creates a ChiSquared feature selector. The selector supports different selection methods: `numTopFeatures`, `percentile`, `fpr`, `fdr`, `fwe`. - `numTopFeatures` chooses a fixed number of top features according to a chi-squared test. - `percentile` is similar but chooses a fraction of all features instead of a fixed number. - `fpr` chooses all features whose p-value is below a threshold, thus controlling the false - positive rate of selection. - `fdr` uses the [Benjamini-Hochberg procedure] - (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) - to choose all features whose false discovery rate is below a threshold. 
- `fwe` chooses all features whose p-values is below a threshold, - thus controlling the family-wise error rate of selection. + + * `numTopFeatures` chooses a fixed number of top features according to a chi-squared test. + + * `percentile` is similar but chooses a fraction of all features + instead of a fixed number. + + * `fpr` chooses all features whose p-value is below a threshold, + thus controlling the false positive rate of selection. + + * `fdr` uses the `Benjamini-Hochberg procedure <https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure>`_ + to choose all features whose false discovery rate is below a threshold. + + * `fwe` chooses all features whose p-values are below a threshold, + thus controlling the family-wise error rate of selection. + By default, the selection method is `numTopFeatures`, with the default number of top features set to 50.