From 2c071791b2c6fd7d388343ac95783c32ffdae529 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Fri, 23 Sep 2016 15:27:19 +0800 Subject: [PATCH 1/9] add feature selector method: FDR and FWE --- .../spark/ml/feature/ChiSqSelector.scala | 58 +++++++++++++++---- .../mllib/api/python/PythonMLLibAPI.scala | 22 ++++++- .../spark/mllib/feature/ChiSqSelector.scala | 52 ++++++++++++++--- .../mllib/feature/ChiSqSelectorSuite.scala | 2 +- python/pyspark/mllib/feature.py | 36 ++++++++++-- 5 files changed, 142 insertions(+), 28 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 0c6a37bab0aad..0dfea1c6b26b0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -64,23 +64,41 @@ private[feature] trait ChiSqSelectorParams extends Params /** @group getParam */ def getPercentile: Double = $(percentile) - final val alpha = new DoubleParam(this, "alpha", + final val alphaFPR = new DoubleParam(this, "alphaFPR", "The highest p-value for features to be kept.", ParamValidators.inRange(0, 1)) - setDefault(alpha -> 0.05) + setDefault(alphaFPR -> 0.05) /** @group getParam */ - def getAlpha: Double = $(alpha) + def getAlphaFPR: Double = $(alphaFPR) + + final val alphaFDR = new DoubleParam(this, "alphaFDR", + "The highest uncorrected p-value for features to be kept.", + ParamValidators.inRange(0, 1)) + setDefault(alphaFDR -> 0.05) + + /** @group getParam */ + def getAlphaFDR: Double = $(alphaFDR) + + final val alphaFWE = new DoubleParam(this, "alphaFWE", + "The highest uncorrected p-value for features to be kept.", + ParamValidators.inRange(0, 1)) + setDefault(alphaFWE -> 0.05) + + /** @group getParam */ + def getAlphaFWE: Double = $(alphaFWE) /** - * The ChiSqSelector supports KBest, Percentile, FPR selection, + * The ChiSqSelector supports `KBest`, `Percentile`, `FPR`, 
`FDR`, `FWE` selection, * which is the same as ChiSqSelectorType defined in MLLIB. * when call setNumTopFeatures, the selectorType is set to KBest * when call setPercentile, the selectorType is set to Percentile - * when call setAlpha, the selectorType is set to FPR + * when call setFPR, the selectorType is set to FPR + * when call setFDR, the selectorType is set to FDR + * when call setFWE, the selectorType is set to FWE */ final val selectorType = new Param[String](this, "selectorType", - "ChiSqSelector Type: KBest, Percentile, FPR") + "ChiSqSelector Type: KBest, Percentile, FPR, FDR, FWE") setDefault(selectorType -> ChiSqSelectorType.KBest.toString) /** @group getParam */ @@ -93,7 +111,9 @@ private[feature] trait ChiSqSelectorParams extends Params * The selector supports three selection methods: `KBest`, `Percentile` and `FPR`. * `KBest` chooses the `k` top features according to a chi-squared test. * `Percentile` is similar but chooses a fraction of all features instead of a fixed number. - * `FPR` chooses all features whose false positive rate meets some threshold. + * `FPR` select features based on a false positive rate test. + * `FDR` select features based on an estimated false discovery rate. + * `FWE` select features based on family-wise error rate. * By default, the selection method is `KBest`, the default number of top features is 50. * User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods. 
*/ @@ -118,9 +138,21 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str } @Since("2.1.0") - def setAlpha(value: Double): this.type = { + def setFPR(value: Double): this.type = { set(selectorType, ChiSqSelectorType.FPR.toString) - set(alpha, value) + set(alphaFPR, value) + } + + @Since("2.1.0") + def setFDR(value: Double): this.type = { + set(selectorType, ChiSqSelectorType.FDR.toString) + set(alphaFDR, value) + } + + @Since("2.1.0") + def setFWE(value: Double): this.type = { + set(selectorType, ChiSqSelectorType.FWE.toString) + set(alphaFWE, value) } /** @group setParam */ @@ -143,14 +175,18 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str case Row(label: Double, features: Vector) => OldLabeledPoint(label, OldVectors.fromML(features)) } - var selector = new feature.ChiSqSelector() + val selector = new feature.ChiSqSelector() ChiSqSelectorType.withName($(selectorType)) match { case ChiSqSelectorType.KBest => selector.setNumTopFeatures($(numTopFeatures)) case ChiSqSelectorType.Percentile => selector.setPercentile($(percentile)) case ChiSqSelectorType.FPR => - selector.setAlpha($(alpha)) + selector.setFPR($(alphaFPR)) + case ChiSqSelectorType.FDR => + selector.setFDR($(alphaFDR)) + case ChiSqSelectorType.FWE => + selector.setFWE($(alphaFWE)) case errorType => throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType") } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 5cffbf0892888..da0d26b71b9e5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -657,7 +657,27 @@ private[python] class PythonMLLibAPI extends Serializable { * exit; see the Py4J documentation. 
*/ def fitChiSqSelectorFPR(alpha: Double, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { - new ChiSqSelector().setAlpha(alpha).fit(data.rdd) + new ChiSqSelector().setFPR(alpha).fit(data.rdd) + } + + /** + * Java stub for ChiSqSelector.fit() when the selection type is FDR. This stub returns a + * handle to the Java object instead of the content of the Java object. + * Extra care needs to be taken in the Python code to ensure it gets freed on + * exit; see the Py4J documentation. + */ + def fitChiSqSelectorFDR(alpha: Double, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { + new ChiSqSelector().setFDR(alpha).fit(data.rdd) + } + + /** + * Java stub for ChiSqSelector.fit() when the selection type is FWE. This stub returns a + * handle to the Java object instead of the content of the Java object. + * Extra care needs to be taken in the Python code to ensure it gets freed on + * exit; see the Py4J documentation. + */ + def fitChiSqSelectorFWE(alpha: Double, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = { + new ChiSqSelector().setFWE(alpha).fit(data.rdd) } /** diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index f68a017184b21..5db9785691824 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -35,7 +35,7 @@ import org.apache.spark.sql.{Row, SparkSession} @Since("2.1.0") private[spark] object ChiSqSelectorType extends Enumeration { type SelectorType = Value - val KBest, Percentile, FPR = Value + val KBest, Percentile, FPR, FDR, FWE = Value } /** @@ -166,10 +166,12 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { /** * Creates a ChiSquared feature selector. - * The selector supports three selection methods: `KBest`, `Percentile` and `FPR`. 
+ * The selector supports five selection methods: `KBest`, `Percentile`, `FPR`, `FDR`, `FWE`. * `KBest` chooses the `k` top features according to a chi-squared test. * `Percentile` is similar but chooses a fraction of all features instead of a fixed number. - * `FPR` chooses all features whose false positive rate meets some threshold. + * `FPR` select features based on a false positive rate test. + * `FDR` select features based on an estimated false discovery rate. + * `FWE` select features based on family-wise error rate. * By default, the selection method is `KBest`, the default number of top features is 50. * User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods. */ @@ -177,7 +179,9 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { class ChiSqSelector @Since("2.1.0") () extends Serializable { var numTopFeatures: Int = 50 var percentile: Double = 0.1 - var alpha: Double = 0.05 + var alphaFPR: Double = 0.05 + var alphaFDR: Double = 0.05 + var alphaFWE: Double = 0.05 var selectorType = ChiSqSelectorType.KBest /** @@ -205,13 +209,29 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { } @Since("2.1.0") - def setAlpha(value: Double): this.type = { + def setFPR(value: Double): this.type = { require(0.0 <= value && value <= 1.0, "Alpha must be in [0,1]") - alpha = value + alphaFPR = value selectorType = ChiSqSelectorType.FPR this } + @Since("2.1.0") + def setFDR(value: Double): this.type = { + require(0.0 <= value && value <= 1.0, "Alpha must be in [0,1]") + alphaFDR = value + selectorType = ChiSqSelectorType.FDR + this + } + + @Since("2.1.0") + def setFWE(value: Double): this.type = { + require(0.0 <= value && value <= 1.0, "Alpha must be in [0,1]") + alphaFWE = value + selectorType = ChiSqSelectorType.FWE + this + } + @Since("2.1.0") def setChiSqSelectorType(value: ChiSqSelectorType.Value): this.type = { selectorType = value @@ -228,18 +248,32 @@ class ChiSqSelector @Since("2.1.0") () extends 
Serializable { @Since("1.3.0") def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { val chiSqTestResult = Statistics.chiSqTest(data) - .zipWithIndex.sortBy { case (res, _) => -res.statistic } + .zipWithIndex val features = selectorType match { case ChiSqSelectorType.KBest => chiSqTestResult + .sortBy { case (res, _) => -res.statistic } .take(numTopFeatures) case ChiSqSelectorType.Percentile => chiSqTestResult + .sortBy { case (res, _) => -res.statistic } .take((chiSqTestResult.length * percentile).toInt) case ChiSqSelectorType.FPR => chiSqTestResult - .filter{ case (res, _) => res.pValue < alpha } + .filter{ case (res, _) => res.pValue < alphaFPR } + case ChiSqSelectorType.FDR => + val tempRDD = chiSqTestResult + .sortBy{ case (res, _) => res.pValue } + val maxIndex = tempRDD + .zipWithIndex + .filter{ case ((res, index1), index2) => + res.pValue <= alphaFDR * (index2 + 1) / chiSqTestResult.length } + .map{ case (_, index) => index} + .max + tempRDD.take(maxIndex + 1) + case ChiSqSelectorType.FWE => chiSqTestResult + .filter{ case (res, _) => res.pValue < alphaFWE/chiSqTestResult.length } case errorType => throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType") } - val indices = features.map { case (_, indices) => indices } + val indices = features.map { case (_, index) => index } new ChiSqSelectorModel(indices) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala index e181a544f7159..a036d73f7ebbb 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala @@ -76,7 +76,7 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(2.0, Vectors.dense(Array(9.0)))) - val model = new 
ChiSqSelector().setAlpha(0.1).fit(labeledDiscreteData) + val model = new ChiSqSelector().setFPR(0.1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 077c11370eb3f..138840241e658 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -275,7 +275,7 @@ class ChiSqSelectorType: """ This class defines the selector types of Chi Square Selector. """ - KBest, Percentile, FPR = range(3) + KBest, Percentile, FPR, FDR, FWE = range(5) class ChiSqSelector(object): @@ -284,7 +284,9 @@ class ChiSqSelector(object): The selector supports three selection methods: `KBest`, `Percentile` and `FPR`. `KBest` chooses the `k` top features according to a chi-squared test. `Percentile` is similar but chooses a fraction of all features instead of a fixed number. - `FPR` chooses all features whose false positive rate meets some threshold. + `FPR` select features based on a false positive rate test. + `FDR` select features based on an estimated false discovery rate. + `FWE` select features based on family-wise error rate. By default, the selection method is `KBest`, the default number of top features is 50. User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods. @@ -310,7 +312,7 @@ class ChiSqSelector(object): ... LabeledPoint(1.0, [0.0, 9.0, 8.0, 4.0]), ... LabeledPoint(2.0, [8.0, 9.0, 5.0, 9.0]) ... 
] - >>> model = ChiSqSelector().setAlpha(0.1).fit(sc.parallelize(data)) + >>> model = ChiSqSelector().setFPR(0.1).fit(sc.parallelize(data)) >>> model.transform(DenseVector([1.0,2.0,3.0,4.0])) DenseVector([4.0]) @@ -339,14 +341,32 @@ def setPercentile(self, percentile): return self @since('2.1.0') - def setAlpha(self, alpha): + def setFPR(self, alpha): """ set alpha [0.0, 1.0] for feature selection by FPR """ - self.alpha = float(alpha) + self.alphaFPR = float(alpha) self.selectorType = ChiSqSelectorType.FPR return self + @since('2.1.0') + def setFDR(self, alpha): + """ + set alpha [0.0, 1.0] for feature selection by FDR + """ + self.alphaFDR = float(alpha) + self.selectorType = ChiSqSelectorType.FDR + return self + + @since('2.1.0') + def setFWE(self, alpha): + """ + set alpha [0.0, 1.0] for feature selection by FWE + """ + self.alphaFWE = float(alpha) + self.selectorType = ChiSqSelectorType.FWE + return self + @since('1.4.0') def fit(self, data): """ @@ -362,7 +382,11 @@ def fit(self, data): elif self.selectorType == ChiSqSelectorType.Percentile: jmodel = callMLlibFunc("fitChiSqSelectorPercentile", self.percentile, data) elif self.selectorType == ChiSqSelectorType.FPR: - jmodel = callMLlibFunc("fitChiSqSelectorFPR", self.alphaFPR, data) + elif self.selectorType == ChiSqSelectorType.FDR: + jmodel = callMLlibFunc("fitChiSqSelectorFDR", self.alphaFDR, data) + elif self.selectorType == ChiSqSelectorType.FWE: + jmodel = callMLlibFunc("fitChiSqSelectorFWE", self.alphaFWE, data) else: raise ValueError("ChiSqSelector type supports KBest(0), Percentile(1) and" " FPR(2), the current value is: %s" % self.selectorType) From 9c7fae33d715b1f8dc6d1fc4ee468cbe047ef5c9 Mon Sep 17 00:00:00 2001 From: Peng Date: Tue, 27 Sep 2016 20:26:28 +0800 Subject: [PATCH 2/9] fix python style bug --- python/pyspark/ml/feature.py | 25 +++++++++++++++---------- python/pyspark/mllib/feature.py | 3 ++- 2 files changed, 17 
insertions(+), 11 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 30311746acd9d..d6bd33239861a 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2602,34 +2602,39 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja typeConverter=TypeConverters.toFloat) alphaFPR = Param(Params._dummy(), "alphaFPR", "The highest p-value for features to be kept.", - typeConverter=TypeConverters.toFloat) + typeConverter=TypeConverters.toFloat) - alphaFDR = Param(Params._dummy(), "alphaFDR", "The highest uncorrected p-value for features to be kept.", - typeConverter=TypeConverters.toFloat) + alphaFDR = Param(Params._dummy(), "alphaFDR", "The highest uncorrected p-value for " + + "features to be kept.", typeConverter=TypeConverters.toFloat) - alphaFWE = Param(Params._dummy(), "alphaFWE", "The highest uncorrected p-value for features to be kept.", - typeConverter=TypeConverters.toFloat) + alphaFWE = Param(Params._dummy(), "alphaFWE", "The highest uncorrected p-value for " + + "features to be kept.", typeConverter=TypeConverters.toFloat) @keyword_only def __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, - labelCol="label", selectorType="kbest", percentile=0.1, alphaFPR=0.05, alphaFDR=0.05, alphaFWE=0.05): + labelCol="label", selectorType="kbest", percentile=0.1, alphaFPR=0.05, + alphaFDR=0.05, alphaFWE=0.05): """ __init__(self, numTopFeatures=50, featuresCol="features", outputCol=None, \ - labelCol="label", selectorType="kbest", percentile=0.1, alphaFPR=0.05, alphaFDR=0.05, alphaFWE=0.05) + labelCol="label", selectorType="kbest", percentile=0.1, alphaFPR=0.05, \ + alphaFDR=0.05, alphaFWE=0.05) """ super(ChiSqSelector, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.ChiSqSelector", self.uid) - self._setDefault(numTopFeatures=50, selectorType="kbest", percentile=0.1, alphaFPR=0.05, alphaFDR=0.05, alphaFWE=0.05) + 
self._setDefault(numTopFeatures=50, selectorType="kbest", percentile=0.1, alphaFPR=0.05, + alphaFDR=0.05, alphaFWE=0.05) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @keyword_only @since("2.0.0") def setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, - labelCol="labels", selectorType="kbest", percentile=0.1, alphaFPR=0.05, alphaFDR=0.05, alphaFWE=0.05): + labelCol="labels", selectorType="kbest", percentile=0.1, alphaFPR=0.05, + alphaFDR=0.05, alphaFWE=0.05): """ setParams(self, numTopFeatures=50, featuresCol="features", outputCol=None, \ - labelCol="labels", selectorType="kbest", percentile=0.1, alphaFPR=0.05, alphaFDR=0.05, alphaFWE=0.05) + labelCol="labels", selectorType="kbest", percentile=0.1, alphaFPR=0.05, \ + alphaFDR=0.05, alphaFWE=0.05) Sets params for this ChiSqSelector. """ kwargs = self.setParams._input_kwargs diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index ac3396ee9e845..a0a2fc94283bd 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -311,7 +311,8 @@ class ChiSqSelector(object): .. versionadded:: 1.4.0 """ - def __init__(self, numTopFeatures=50, selectorType="kbest", percentile=0.1, alphaFPR=0.05, alphaFDR=0.05, alphaFWE=0.05): + def __init__(self, numTopFeatures=50, selectorType="kbest", percentile=0.1, alphaFPR=0.05, + alphaFDR=0.05, alphaFWE=0.05): self.numTopFeatures = numTopFeatures self.selectorType = selectorType self.percentile = percentile From 2e97c5542a7f1c5157d93f227da7cb33eaccc230 Mon Sep 17 00:00:00 2001 From: Peng Date: Tue, 27 Sep 2016 20:39:47 +0800 Subject: [PATCH 3/9] python style change --- python/pyspark/mllib/feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index a0a2fc94283bd..659e5f4724726 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -312,7 +312,7 @@ class ChiSqSelector(object): .. 
versionadded:: 1.4.0 """ def __init__(self, numTopFeatures=50, selectorType="kbest", percentile=0.1, alphaFPR=0.05, - alphaFDR=0.05, alphaFWE=0.05): + alphaFDR=0.05, alphaFWE=0.05): self.numTopFeatures = numTopFeatures self.selectorType = selectorType self.percentile = percentile From d05d7de190785c393dc4290a26cea61d414191af Mon Sep 17 00:00:00 2001 From: Peng Date: Mon, 10 Oct 2016 15:01:08 +0800 Subject: [PATCH 4/9] minor change --- .../org/apache/spark/mllib/feature/ChiSqSelector.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 1cdcb59e878f7..216c9434c6ab5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -244,20 +244,20 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { .take((chiSqTestResult.length * percentile).toInt) case ChiSqSelector.FPR => chiSqTestResult - .filter{ case (res, _) => res.pValue < alpha } + .filter { case (res, _) => res.pValue < alpha } case ChiSqSelector.FDR => val tempRDD = chiSqTestResult - .sortBy{ case (res, _) => res.pValue } + .sortBy { case (res, _) => res.pValue } val maxIndex = tempRDD .zipWithIndex - .filter{ case ((res, _), index) => + .filter { case ((res, _), index) => res.pValue <= alpha * (index + 1) / chiSqTestResult.length } - .map{ case (_, index) => index} + .map { case (_, index) => index} .max tempRDD.take(maxIndex + 1) case ChiSqSelector.FWE => chiSqTestResult - .filter{ case (res, _) => res.pValue < alpha/chiSqTestResult.length } + .filter { case (res, _) => res.pValue < alpha/chiSqTestResult.length } case errorType => throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType") } From d51b78b6cdd94b01a76a99b3d70f957737d88b20 Mon Sep 17 00:00:00 2001 From: Peng Date: Thu, 20 Oct 2016 15:36:55 
+0800 Subject: [PATCH 5/9] add test cases, and revise docs --- docs/ml-features.md | 6 +- docs/mllib-feature-extraction.md | 8 +- .../spark/ml/feature/ChiSqSelector.scala | 18 +- .../spark/mllib/feature/ChiSqSelector.scala | 37 ++- .../spark/ml/feature/ChiSqSelectorSuite.scala | 304 ++++++++++++++++-- .../mllib/feature/ChiSqSelectorSuite.scala | 226 +++++++++++-- python/pyspark/ml/feature.py | 14 +- python/pyspark/mllib/feature.py | 8 +- 8 files changed, 542 insertions(+), 79 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index a7f710fa52e64..9e4eb8dba68d9 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1333,14 +1333,16 @@ for more details on the API. `ChiSqSelector` stands for Chi-Squared feature selection. It operates on labeled data with categorical features. ChiSqSelector uses the [Chi-Squared test of independence](https://en.wikipedia.org/wiki/Chi-squared_test) to decide which -features to choose. It supports three selection methods: `KBest`, `Percentile` and `FPR`: +features to choose. It supports five selection methods: `KBest`, `Percentile`, `FPR`, `FDR` and `FWE`: * `KBest` chooses the `k` top features according to a chi-squared test. This is akin to yielding the features with the most predictive power. * `Percentile` is similar to `KBest` but chooses a fraction of all features instead of a fixed number. * `FPR` chooses all features whose false positive rate meets some threshold. +* `FDR` chooses all features whose false discovery rate meets some threshold. +* `FWE` chooses all features whose family-wise error rate meets some threshold. By default, the selection method is `KBest`, the default number of top features is 50. User can use -`setNumTopFeatures`, `setPercentile` and `setAlpha` to set different selection methods. +`setSelectorType` to set different selection methods. 
**Examples** diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md index 87e1e027e945b..41577decad877 100644 --- a/docs/mllib-feature-extraction.md +++ b/docs/mllib-feature-extraction.md @@ -227,20 +227,22 @@ both speed and statistical learning behavior. [`ChiSqSelector`](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) implements Chi-Squared feature selection. It operates on labeled data with categorical features. ChiSqSelector uses the [Chi-Squared test of independence](https://en.wikipedia.org/wiki/Chi-squared_test) to decide which -features to choose. It supports three selection methods: `KBest`, `Percentile` and `FPR`: +features to choose. It supports five selection methods: `KBest`, `Percentile`, `FPR`, `FDR`, and `FWE`: * `KBest` chooses the `k` top features according to a chi-squared test. This is akin to yielding the features with the most predictive power. * `Percentile` is similar to `KBest` but chooses a fraction of all features instead of a fixed number. * `FPR` chooses all features whose false positive rate meets some threshold. +* `FDR` chooses all features whose false discovery rate meets some threshold. +* `FWE` chooses all features whose family-wise error rate meets some threshold. By default, the selection method is `KBest`, the default number of top features is 50. User can use -`setNumTopFeatures`, `setPercentile` and `setAlpha` to set different selection methods. +`setSelectorType` to set different selection methods. The number of features to select can be tuned using a held-out validation set. ### Model Fitting -`ChiSqSelector` takes a `numTopFeatures` parameter specifying the number of top features that +`ChiSqSelector` can take a `numTopFeatures` parameter specifying the number of top features that the selector will select. 
The [`fit`](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) method takes diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 033a0b9c6de17..da131e4f7fc91 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -72,14 +72,15 @@ private[feature] trait ChiSqSelectorParams extends Params def getPercentile: Double = $(percentile) /** - * alpha means the highest p-value for features to be kept when select type is "fpr". - * alpha means the highest uncorrected p-value for features to be kept when select type + * Only applicable when selectorType = "fpr", "fdr", or "fwe" + * alpha means the highest p-value for features to be kept when select type is "fpr", + * or the highest uncorrected p-value for features to be kept when select type * is "fdr" and "fwe". * Default value is 0.05. */ final val alpha = new DoubleParam(this, "alpha", "alpha means the highest p-value for features to be kept when select type is fpr, " + - "alpha means the highest uncorrected p-value for features to be kept when select type " + + "or the highest uncorrected p-value for features to be kept when select type " + "is fdr and fwe.", ParamValidators.inRange(0, 1)) setDefault(alpha -> 0.05) @@ -104,13 +105,12 @@ private[feature] trait ChiSqSelectorParams extends Params /** * Chi-Squared feature selection, which selects categorical features to use for predicting a * categorical label. - * The selector supports three selection methods: `kbest`, `percentile` and `fpr`. + * The selector supports five selection methods: `kbest`, `percentile`, `fpr`, `fdr` and `fwe`. * `kbest` chooses the `k` top features according to a chi-squared test. * `percentile` is similar but chooses a fraction of all features instead of a fixed number. 
* `fpr` chooses all features whose false positive rate meets some threshold. - * `fpr` select features based on a false positive rate test. - * `fdr` select features based on an estimated false discovery rate. - * `fwe` select features based on family-wise error rate. + * `fdr` chooses all features whose false discovery rate meets some threshold. + * `fwe` chooses all features whose family-wise error rate meets some threshold. * By default, the selection method is `kbest`, the default number of top features is 50. */ @Since("1.6.0") @@ -134,9 +134,7 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str /** @group setParam */ @Since("2.1.0") - def setAlpha(value: Double): this.type = { - set(alpha, value) - } + def setAlpha(value: Double): this.type = set(alpha, value) /** @group setParam */ @Since("1.6.0") diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 21fd42a6a15cb..64844e607b298 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -171,12 +171,12 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { /** * Creates a ChiSquared feature selector. - * The selector supports three selection methods: `kbest`, `percentile` and `fpr`. + * The selector supports five selection methods: `kbest`, `percentile`, `fpr`, `fdr` and `fwe`. * `kbest` chooses the `k` top features according to a chi-squared test. * `percentile` is similar but chooses a fraction of all features instead of a fixed number. - * `fpr` select features based on a false positive rate test. - * `fdr` select features based on an estimated false discovery rate. - * `fwe` select features based on family-wise error rate. + * `fpr` chooses all features whose false positive rate meets some threshold. 
+ * `fdr` chooses all features whose false discovery rate meets some threshold. + * `fwe` chooses all features whose family-wise error rate meets some threshold. * By default, the selection method is `kbest`, the default number of top features is 50. */ @Since("1.3.0") @@ -246,15 +246,16 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { chiSqTestResult .filter { case (res, _) => res.pValue < alpha } case ChiSqSelector.FDR => - val tempRDD = chiSqTestResult + // This uses the Benjamini-Hochberg procedure. + val tempRes = chiSqTestResult .sortBy { case (res, _) => res.pValue } - val maxIndex = tempRDD + val maxIndex = tempRes .zipWithIndex .filter { case ((res, _), index) => res.pValue <= alpha * (index + 1) / chiSqTestResult.length } - .map { case (_, index) => index} + .map { case (_, index) => index } .max - tempRDD.take(maxIndex + 1) + tempRes.take(maxIndex + 1) case ChiSqSelector.FWE => chiSqTestResult .filter { case (res, _) => res.pValue < alpha/chiSqTestResult.length } @@ -269,19 +270,29 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { @Since("2.1.0") object ChiSqSelector { - /** String name for `kbest` selector type. */ + /** String name for `kbest` selector type. + * Choose the `k` top features according to a chi-squared test. + */ private[spark] val KBest: String = "kbest" - /** String name for `percentile` selector type. */ + /** String name for `percentile` selector type. + * Choose a fraction of all features instead of a fixed number. + */ private[spark] val Percentile: String = "percentile" - /** String name for `fpr` selector type. */ + /** String name for `fpr` selector type. + * Choose all features whose false positive rate meets some threshold. + */ private[spark] val FPR: String = "fpr" - /** String name for `fdr` selector type. */ + /** String name for `fdr` selector type. + * Choose all features whose false discovery rate meets some threshold. 
+ */ private[spark] val FDR: String = "fdr" - /** String name for `fwe` selector type. */ + /** String name for `fwe` selector type. + * Choose all features whose family-wise error rate meets some threshold. + */ private[spark] val FWE: String = "fwe" /** Set of selector type and param pairs that ChiSqSelector supports. */ diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala index 3171549544e35..1ec1731bdd194 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala @@ -28,22 +28,85 @@ import org.apache.spark.sql.Row class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { - test("Test Chi-Square selector") { + /* + * Contingency tables + * feature0 = {6.0, 0.0, 8.0} + * class 0 1 2 + * 6.0||1|0|0| + * 0.0||0|3|0| + * 8.0||0|0|2| + * degree of freedom = 4, statistic = 12, pValue = 0.017 + * + * feature1 = {7.0, 9.0} + * class 0 1 2 + * 7.0||1|0|0| + * 9.0||0|3|2| + * degree of freedom = 2, statistic = 6, pValue = 0.049 + * + * feature2 = {0.0, 6.0, 3.0, 8.0} + * class 0 1 2 + * 0.0||1|0|0| + * 6.0||0|1|2| + * 3.0||0|1|0| + * 8.0||0|1|0| + * degree of freedom = 6, statistic = 8.66, pValue = 0.193 + * + * feature3 = {7.0, 0.0, 5.0, 4.0} + * class 0 1 2 + * 7.0||1|0|0| + * 0.0||0|2|0| + * 5.0||0|1|1| + * 4.0||0|0|1| + * degree of freedom = 6, statistic = 9.5, pValue = 0.147 + * + * feature4 = {6.0, 5.0, 4.0, 0.0} + * class 0 1 2 + * 6.0||1|1|0| + * 5.0||0|2|0| + * 4.0||0|0|1| + * 0.0||0|0|1| + * degree of freedom = 6, statistic = 8.0, pValue = 0.238 + * + * feature5 = {0.0, 9.0, 5.0, 4.0} + * class 0 1 2 + * 0.0||1|0|1| + * 9.0||0|1|0| + * 5.0||0|1|0| + * 4.0||0|1|1| + * degree of freedom = 6, statistic = 5, pValue = 0.54 + * + * Use chi-squared calculator from Internet + */ + + test("Test Chi-Square selector 
KBest") { import testImplicits._ val data = Seq( - LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), - LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), - LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))) + LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0))) ) val preFilteredData = Seq( - Vectors.dense(8.0), + Vectors.dense(6.0), + Vectors.dense(0.0), Vectors.dense(0.0), Vectors.dense(0.0), + Vectors.dense(8.0), Vectors.dense(8.0) ) + val preFilteredData2 = Seq( + Vectors.dense(6.0, 7.0, 7.0), + Vectors.dense(0.0, 9.0, 0.0), + Vectors.dense(0.0, 9.0, 0.0), + Vectors.dense(0.0, 9.0, 5.0), + Vectors.dense(8.0, 9.0, 5.0), + Vectors.dense(8.0, 9.0, 4.0) + ) + val df = sc.parallelize(data.zip(preFilteredData)) .map(x => (x._1.label, x._1.features, x._2)) .toDF("label", "data", "preFilteredData") @@ -60,37 +123,236 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext assert(vec1 ~== vec2 absTol 1e-1) } - selector.setSelectorType("percentile").setPercentile(0.34).fit(df).transform(df) - .select("filtered", "preFilteredData").collect().foreach { - case Row(vec1: Vector, vec2: Vector) => - assert(vec1 ~== vec2 absTol 1e-1) - } + val df2 = sc.parallelize(data.zip(preFilteredData2)) + .map(x => (x._1.label, x._1.features, x._2)) + .toDF("label", "data", "preFilteredData") + + selector.setNumTopFeatures(3).fit(df2).transform(df2).select("filtered", "preFilteredData") + .collect().foreach { + case Row(vec1: Vector, vec2: Vector) => + assert(vec1 
~== vec2 absTol 1e-1) + } + } + + test("Test Chi-Square selector Percentile") { + import testImplicits._ + val data = Seq( + LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0))) + ) + + val preFilteredData = Seq( + Vectors.dense(6.0), + Vectors.dense(0.0), + Vectors.dense(0.0), + Vectors.dense(0.0), + Vectors.dense(8.0), + Vectors.dense(8.0) + ) val preFilteredData2 = Seq( - Vectors.dense(8.0, 7.0), + Vectors.dense(6.0, 7.0, 7.0), + Vectors.dense(0.0, 9.0, 0.0), + Vectors.dense(0.0, 9.0, 0.0), + Vectors.dense(0.0, 9.0, 5.0), + Vectors.dense(8.0, 9.0, 5.0), + Vectors.dense(8.0, 9.0, 4.0) + ) + + val df = sc.parallelize(data.zip(preFilteredData)) + .map(x => (x._1.label, x._1.features, x._2)) + .toDF("label", "data", "preFilteredData") + + val selector = new ChiSqSelector() + .setSelectorType("percentile") + .setPercentile(0.2) + .setFeaturesCol("data") + .setLabelCol("label") + .setOutputCol("filtered") + + selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { + case Row(vec1: Vector, vec2: Vector) => + assert(vec1 ~== vec2 absTol 1e-1) + } + + val df2 = sc.parallelize(data.zip(preFilteredData2)) + .map(x => (x._1.label, x._1.features, x._2)) + .toDF("label", "data", "preFilteredData") + + selector.setPercentile(0.5).fit(df2).transform(df2).select("filtered", "preFilteredData") + .collect().foreach { + case Row(vec1: Vector, vec2: Vector) => + assert(vec1 ~== vec2 absTol 1e-1) + } + } + + test("Test Chi-Square selector FPR") { + import testImplicits._ + val data = Seq( + LabeledPoint(0.0, Vectors.sparse(6, Array((0, 
6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0))) + ) + + val preFilteredData = Seq( + Vectors.dense(6.0), + Vectors.dense(0.0), + Vectors.dense(0.0), + Vectors.dense(0.0), + Vectors.dense(8.0), + Vectors.dense(8.0) + ) + + val preFilteredData2 = Seq( + Vectors.dense(6.0, 7.0, 7.0), + Vectors.dense(0.0, 9.0, 0.0), + Vectors.dense(0.0, 9.0, 0.0), + Vectors.dense(0.0, 9.0, 5.0), + Vectors.dense(8.0, 9.0, 5.0), + Vectors.dense(8.0, 9.0, 4.0) + ) + + val df = sc.parallelize(data.zip(preFilteredData)) + .map(x => (x._1.label, x._1.features, x._2)) + .toDF("label", "data", "preFilteredData") + + val selector = new ChiSqSelector() + .setSelectorType("fpr") + .setAlpha(0.02) + .setFeaturesCol("data") + .setLabelCol("label") + .setOutputCol("filtered") + + selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { + case Row(vec1: Vector, vec2: Vector) => + assert(vec1 ~== vec2 absTol 1e-1) + } + + val df2 = sc.parallelize(data.zip(preFilteredData2)) + .map(x => (x._1.label, x._1.features, x._2)) + .toDF("label", "data", "preFilteredData") + + selector.setAlpha(0.15).fit(df2).transform(df2).select("filtered", "preFilteredData") + .collect().foreach { + case Row(vec1: Vector, vec2: Vector) => + assert(vec1 ~== vec2 absTol 1e-1) + } + } + + test("Test Chi-Square selector FDR") { + import testImplicits._ + val data = Seq( + LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 
5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0))) + ) + + val preFilteredData = Seq( + Vectors.dense(6.0), + Vectors.dense(0.0), + Vectors.dense(0.0), + Vectors.dense(0.0), + Vectors.dense(8.0), + Vectors.dense(8.0) + ) + + val preFilteredData2 = Seq( + Vectors.dense(6.0, 7.0), Vectors.dense(0.0, 9.0), Vectors.dense(0.0, 9.0), + Vectors.dense(0.0, 9.0), + Vectors.dense(8.0, 9.0), Vectors.dense(8.0, 9.0) ) + val df = sc.parallelize(data.zip(preFilteredData)) + .map(x => (x._1.label, x._1.features, x._2)) + .toDF("label", "data", "preFilteredData") + + val selector = new ChiSqSelector() + .setSelectorType("fdr") + .setAlpha(0.12) + .setFeaturesCol("data") + .setLabelCol("label") + .setOutputCol("filtered") + + selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { + case Row(vec1: Vector, vec2: Vector) => + assert(vec1 ~== vec2 absTol 1e-1) + } + val df2 = sc.parallelize(data.zip(preFilteredData2)) .map(x => (x._1.label, x._1.features, x._2)) .toDF("label", "data", "preFilteredData") - selector.setSelectorType("fpr").setAlpha(0.2).fit(df2).transform(df2) - .select("filtered", "preFilteredData").collect().foreach { - case Row(vec1: Vector, vec2: Vector) => - assert(vec1 ~== vec2 absTol 1e-1) - } + selector.setAlpha(0.15).fit(df2).transform(df2).select("filtered", "preFilteredData") + .collect().foreach { + case Row(vec1: Vector, vec2: Vector) => + assert(vec1 ~== vec2 absTol 1e-1) + } + } + + test("Test Chi-Square selector FWE") { + import testImplicits._ + val data = Seq( + LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, 
Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0))) + ) - selector.setSelectorType("fwe").setAlpha(0.5).fit(df2).transform(df2) - .select("filtered", "preFilteredData").collect().foreach { + val preFilteredData = Seq( + Vectors.dense(6.0), + Vectors.dense(0.0), + Vectors.dense(0.0), + Vectors.dense(0.0), + Vectors.dense(8.0), + Vectors.dense(8.0) + ) + + val preFilteredData2 = Seq( + Vectors.dense(6.0, 7.0), + Vectors.dense(0.0, 9.0), + Vectors.dense(0.0, 9.0), + Vectors.dense(0.0, 9.0), + Vectors.dense(8.0, 9.0), + Vectors.dense(8.0, 9.0) + ) + + val df = sc.parallelize(data.zip(preFilteredData)) + .map(x => (x._1.label, x._1.features, x._2)) + .toDF("label", "data", "preFilteredData") + + val selector = new ChiSqSelector() + .setSelectorType("fwe") + .setAlpha(0.12) + .setFeaturesCol("data") + .setLabelCol("label") + .setOutputCol("filtered") + + selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } - selector.setSelectorType("fdr").setAlpha(0.21).fit(df2).transform(df2) - .select("filtered", "preFilteredData").collect().foreach { + val df2 = sc.parallelize(data.zip(preFilteredData2)) + .map(x => (x._1.label, x._1.features, x._2)) + .toDF("label", "data", "preFilteredData") + + selector.setAlpha(0.3).fit(df2).transform(df2).select("filtered", "preFilteredData") + .collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala index ac702b4b7c69e..ca5ba9b5871af 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala +++ 
b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala @@ -27,60 +27,240 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { /* * Contingency tables - * feature0 = {8.0, 0.0} + * feature0 = {6.0, 0.0, 8.0} * class 0 1 2 - * 8.0||1|0|1| - * 0.0||0|2|0| + * 6.0||1|0|0| + * 0.0||0|3|0| + * 8.0||0|0|2| + * degree of freedom = 4, statistic = 12, pValue = 0.017 * * feature1 = {7.0, 9.0} * class 0 1 2 * 7.0||1|0|0| - * 9.0||0|2|1| + * 9.0||0|3|2| + * degree of freedom = 2, statistic = 6, pValue = 0.049 * - * feature2 = {0.0, 6.0, 8.0, 5.0} + * feature2 = {0.0, 6.0, 3.0, 8.0} * class 0 1 2 * 0.0||1|0|0| - * 6.0||0|1|0| + * 6.0||0|1|2| + * 3.0||0|1|0| * 8.0||0|1|0| - * 5.0||0|0|1| + * degree of freedom = 6, statistic = 8.66, pValue = 0.193 + * + * feature3 = {7.0, 0.0, 5.0, 4.0} + * class 0 1 2 + * 7.0||1|0|0| + * 0.0||0|2|0| + * 5.0||0|1|1| + * 4.0||0|0|1| + * degree of freedom = 6, statistic = 9.5, pValue = 0.147 + * + * feature4 = {6.0, 5.0, 4.0, 0.0} + * class 0 1 2 + * 6.0||1|1|0| + * 5.0||0|2|0| + * 4.0||0|0|1| + * 0.0||0|0|1| + * degree of freedom = 6, statistic = 8.0, pValue = 0.238 + * + * feature5 = {0.0, 9.0, 5.0, 4.0} + * class 0 1 2 + * 0.0||1|0|1| + * 9.0||0|1|0| + * 5.0||0|1|0| + * 4.0||0|1|1| + * degree of freedom = 6, statistic = 5, pValue = 0.54 * * Use chi-squared calculator from Internet */ - test("ChiSqSelector transform test (sparse & dense vector)") { + test("ChiSqSelector transform KBest test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( - Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), - LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), - LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2) + Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + 
LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) val preFilteredData = - Set(LabeledPoint(0.0, Vectors.dense(Array(8.0))), + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0)))) val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) + + val preFilteredData2 = + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0, 7.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 5.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 4.0)))) + + val model2 = new ChiSqSelector(3).fit(labeledDiscreteData) + val filteredData2 = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model2.transform(lp.features)) + }.collect().toSet + assert(filteredData2 == preFilteredData2) } - test("ChiSqSelector by FPR transform test (sparse & dense vector)") { + test("ChiSqSelector transform Percentile test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( - Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))), - LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))), - LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2) + Seq(LabeledPoint(0.0, 
Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) val preFilteredData = - Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(4.0))), - LabeledPoint(1.0, Vectors.dense(Array(4.0))), - LabeledPoint(2.0, Vectors.dense(Array(9.0)))) - val model = new ChiSqSelector().setSelectorType("fpr").setAlpha(0.1).fit(labeledDiscreteData) + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0)))) + val model = new ChiSqSelector().setSelectorType("percentile").setPercentile(0.2) + .fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) + + val preFilteredData2 = + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0, 7.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 5.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 4.0)))) + + val model2 = new ChiSqSelector().setSelectorType("percentile").setPercentile(0.5) + .fit(labeledDiscreteData) + val filteredData2 = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model2.transform(lp.features)) + }.collect().toSet + assert(filteredData2 == 
preFilteredData2) + } + + test("ChiSqSelector transform FPR test (sparse & dense vector)") { + val labeledDiscreteData = sc.parallelize( + Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) + val preFilteredData = + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0)))) + val model = new ChiSqSelector().setSelectorType("fpr").setAlpha(0.02) + .fit(labeledDiscreteData) + val filteredData = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model.transform(lp.features)) + }.collect().toSet + assert(filteredData == preFilteredData) + + val preFilteredData2 = + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0, 7.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 5.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 4.0)))) + + val model2 = new ChiSqSelector().setSelectorType("fpr").setAlpha(0.15) + .fit(labeledDiscreteData) + val filteredData2 = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model2.transform(lp.features)) + }.collect().toSet + assert(filteredData2 == preFilteredData2) + } + + test("ChiSqSelector transform FDR test (sparse & dense vector)") { + val labeledDiscreteData = sc.parallelize( + 
Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) + val preFilteredData = + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0)))) + val model = new ChiSqSelector().setSelectorType("fdr").setAlpha(0.12) + .fit(labeledDiscreteData) + val filteredData = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model.transform(lp.features)) + }.collect().toSet + assert(filteredData == preFilteredData) + + val preFilteredData2 = + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0)))) + + val model2 = new ChiSqSelector().setSelectorType("fdr").setAlpha(0.15) + .fit(labeledDiscreteData) + val filteredData2 = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model2.transform(lp.features)) + }.collect().toSet + assert(filteredData2 == preFilteredData2) + } + + test("ChiSqSelector transform FWE test (sparse & dense vector)") { + val labeledDiscreteData = sc.parallelize( + Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + 
LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) + val preFilteredData = + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0)))) + val model = new ChiSqSelector().setSelectorType("fwe").setAlpha(0.15) + .fit(labeledDiscreteData) + val filteredData = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model.transform(lp.features)) + }.collect().toSet + assert(filteredData == preFilteredData) + + val preFilteredData2 = + Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0)))) + + val model2 = new ChiSqSelector().setSelectorType("fwe").setAlpha(0.3) + .fit(labeledDiscreteData) + val filteredData2 = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model2.transform(lp.features)) + }.collect().toSet + assert(filteredData2 == preFilteredData2) } test("model load / save") { diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 48a77e585c8cb..d9d9267e2397e 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2580,8 +2580,15 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja """ .. note:: Experimental - Chi-Squared feature selection, which selects categorical features to use for predicting a - categorical label. 
+ Creates a ChiSquared feature selector. + The selector supports five selection methods: `KBest`, `Percentile`, `FPR`, `FDR` and `FWE`. + `kbest` chooses the `k` top features according to a chi-squared test. + `percentile` is similar but chooses a fraction of all features instead of a fixed number. + `fpr` chooses all features whose false positive rate meets some threshold. + `fdr` chooses all features whose false discovery rate meets some threshold. + `fwe` chooses all features whose family-wise error rate meets some threshold. + By default, the selection method is `kbest`, the default number of top features is 50. + >>> from pyspark.ml.linalg import Vectors >>> df = spark.createDataFrame( @@ -2625,7 +2632,7 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja typeConverter=TypeConverters.toFloat) alpha = Param(Params._dummy(), "alpha", "alpha means the highest p-value for features " + - "to be kept when select type is fpr, alpha means the highest uncorrected " + + "to be kept when select type is fpr, or the highest uncorrected " + "p-value for features to be kept when select type is fdr and fwe.", typeConverter=TypeConverters.toFloat) @@ -2701,6 +2708,7 @@ def getPercentile(self): @since("2.1.0") def setAlpha(self, value): """ + Only applicable when selectorType = "fpr", "fdr" or "fwe" Sets the value of :py:attr:`alpha`. """ return self._set(alpha=value) diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 1a9d4797bfc81..eda4e93c70fbf 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -274,12 +274,12 @@ def transform(self, vector): class ChiSqSelector(object): """ Creates a ChiSquared feature selector. - The selector supports three selection methods: `KBest`, `Percentile` and `FPR`. + The selector supports five selection methods: `KBest`, `Percentile`, `FPR`, `FDR` and `FWE`. `kbest` chooses the `k` top features according to a chi-squared test.
`percentile` is similar but chooses a fraction of all features instead of a fixed number. - `fpr` select features based on a false positive rate test. - `fdr` select features based on an estimated false discovery rate. - `fwe` select features based on family-wise error rate. + `fpr` chooses all features whose false positive rate meets some threshold. + `fdr` chooses all features whose false discovery rate meets some threshold. + `fwe` chooses all features whose family-wise error rate meets some threshold. By default, the selection method is `kbest`, the default number of top features is 50. >>> data = [ From 92530ab7562f5d4968b5c6130ab611fd2b629549 Mon Sep 17 00:00:00 2001 From: Peng Date: Thu, 20 Oct 2016 17:45:26 +0800 Subject: [PATCH 6/9] minor change --- .../scala/org/apache/spark/mllib/feature/ChiSqSelector.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 64844e607b298..a41558cd72d14 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -258,7 +258,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { tempRes.take(maxIndex + 1) case ChiSqSelector.FWE => chiSqTestResult - .filter { case (res, _) => res.pValue < alpha/chiSqTestResult.length } + .filter { case (res, _) => res.pValue < alpha / chiSqTestResult.length } case errorType => throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType") } From 5a7cc2ca9e81ade4d430411ab6e314ae5010169f Mon Sep 17 00:00:00 2001 From: Peng Date: Fri, 23 Dec 2016 12:46:23 +0800 Subject: [PATCH 7/9] doc and Since fix --- docs/ml-features.md | 4 +- docs/mllib-feature-extraction.md | 4 +- .../spark/ml/feature/ChiSqSelector.scala | 23 +-- .../spark/mllib/feature/ChiSqSelector.scala | 16 +-
.../mllib/feature/ChiSqSelectorSuite.scala | 152 ++++-------------- python/pyspark/ml/feature.py | 19 ++- python/pyspark/mllib/feature.py | 11 +- 7 files changed, 73 insertions(+), 156 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index 8dd14dfc3383d..4db2907d59ec1 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1427,8 +1427,8 @@ features to choose. It supports five selection methods: `numTopFeatures`, `perce * `numTopFeatures` chooses a fixed number of top features according to a chi-squared test. This is akin to yielding the features with the most predictive power. * `percentile` is similar to `numTopFeatures` but chooses a fraction of all features instead of a fixed number. * `fpr` chooses all features whose p-value is below a threshold, thus controlling the false positive rate of selection. -* `fdr` chooses all features whose false discovery rate meets some threshold. -* `fwe` chooses all features whose family-wise error rate meets some threshold. +* `fdr` uses the [Benjamini-Hochberg procedure](https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) to choose all features whose false discovery rate is below a threshold. +* `fwe` chooses all features whose p-value is below a threshold, thus controlling the family-wise error rate of selection. By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. The user can choose a selection method using `setSelectorType`. diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md index dd79dc841fbe3..c67cfbe8e607b 100644 --- a/docs/mllib-feature-extraction.md +++ b/docs/mllib-feature-extraction.md @@ -232,8 +232,8 @@ features to choose. It supports five selection methods: `numTopFeatures`, `perce * `numTopFeatures` chooses a fixed number of top features according to a chi-squared test. This is akin to yielding the features with the most predictive power.
* `percentile` is similar to `numTopFeatures` but chooses a fraction of all features instead of a fixed number. * `fpr` chooses all features whose p-value is below a threshold, thus controlling the false positive rate of selection. -* `fdr` chooses all features whose false discovery rate meets some threshold. -* `fwe` chooses all features whose family-wise error rate meets some threshold. +* `fdr` uses the [Benjamini-Hochberg procedure](https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) to choose all features whose false discovery rate is below a threshold. +* `fwe` chooses all features whose p-value is below a threshold, thus controlling the family-wise error rate of selection. By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. The user can choose a selection method using `setSelectorType`. diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 1999c96a03086..abfae3f75d753 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -92,28 +92,28 @@ private[feature] trait ChiSqSelectorParams extends Params def getFpr: Double = $(fpr) /** - * The highest uncorrected p-value for features to be kept. + * The upper bound of the expected false discovery rate. * Only applicable when selectorType = "fdr". * Default value is 0.05. * @group param */ - @Since("2.1.0") + @Since("2.2.0") final val fdr = new DoubleParam(this, "fdr", - "The highest uncorrected p-value for features to be kept.", ParamValidators.inRange(0, 1)) + "The upper bound of the expected false discovery rate.", ParamValidators.inRange(0, 1)) setDefault(fdr -> 0.05) /** @group getParam */ def getFdr: Double = $(fdr) /** - * The highest uncorrected p-value for features to be kept.
+ * The upper bound of the expected family-wise error rate. * Only applicable when selectorType = "fwe". * Default value is 0.05. * @group param */ - @Since("2.1.0") + @Since("2.2.0") final val fwe = new DoubleParam(this, "fwe", - "The highest uncorrected p-value for features to be kept.", ParamValidators.inRange(0, 1)) + "The upper bound of the expected family-wise error rate.", ParamValidators.inRange(0, 1)) setDefault(fwe -> 0.05) /** @group getParam */ @@ -145,8 +145,11 @@ private[feature] trait ChiSqSelectorParams extends Params * - `percentile` is similar but chooses a fraction of all features instead of a fixed number. * - `fpr` chooses all features whose p-value is below a threshold, thus controlling the false * positive rate of selection. - * - `fdr` chooses all features whose false discovery rate meets some threshold. - * - `fwe` chooses all features whose family-wise error rate meets some threshold. + * - `fdr` uses the [Benjamini-Hochberg procedure] + * (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) + * to choose all features whose false discovery rate is below a threshold. + * - `fwe` chooses all features whose p-value is below a threshold, + * thus controlling the family-wise error rate of selection. * By default, the selection method is `numTopFeatures`, with the default number of top features * set to 50.
*/ @@ -170,11 +173,11 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str def setFpr(value: Double): this.type = set(fpr, value) /** @group setParam */ - @Since("2.1.0") + @Since("2.2.0") def setFdr(value: Double): this.type = set(fdr, value) /** @group setParam */ - @Since("2.1.0") + @Since("2.2.0") def setFwe(value: Double): this.type = set(fwe, value) /** @group setParam */ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 24ae113b5d61f..934c3b666a3b4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -177,8 +177,11 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { * - `percentile` is similar but chooses a fraction of all features instead of a fixed number. * - `fpr` chooses all features whose p-value is below a threshold, thus controlling the false * positive rate of selection. - * - `fdr` chooses all features whose false discovery rate meets some threshold. - * - `fwe` chooses all features whose family-wise error rate meets some threshold. + * - `fdr` uses the [Benjamini-Hochberg procedure] + * (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) + * to choose all features whose false discovery rate is below a threshold. + * - `fwe` chooses all features whose p-value is below a threshold, + * thus controlling the family-wise error rate of selection. * By default, the selection method is `numTopFeatures`, with the default number of top features * set to 50.
*/ @@ -220,14 +223,14 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { this } - @Since("2.1.0") + @Since("2.2.0") def setFdr(value: Double): this.type = { require(0.0 <= value && value <= 1.0, "FDR must be in [0,1]") fdr = value this } - @Since("2.1.0") + @Since("2.2.0") def setFwe(value: Double): this.type = { require(0.0 <= value && value <= 1.0, "FWE must be in [0,1]") fwe = value @@ -266,6 +269,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { .filter { case (res, _) => res.pValue < fpr } case ChiSqSelector.FDR => // This uses the Benjamini-Hochberg procedure. + // https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure val tempRes = chiSqTestResult .sortBy { case (res, _) => res.pValue } val maxIndex = tempRes @@ -289,10 +293,10 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable { private[spark] object ChiSqSelector { /** String name for `numTopFeatures` selector type. */ - val NumTopFeatures: String = "numTopFeatures" + private[spark] val NumTopFeatures: String = "numTopFeatures" /** String name for `percentile` selector type. */ - val Percentile: String = "percentile" + private[spark] val Percentile: String = "percentile" /** String name for `fpr` selector type. 
*/ private[spark] val FPR: String = "fpr" diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala index b267cf35220b8..6578963ad8895 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala @@ -75,28 +75,16 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { * Use chi-squared calculator from Internet */ + lazy val labeledDiscreteData = sc.parallelize( + Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), + LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), + LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), + LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) + test("ChiSqSelector transform by KBest test (sparse & dense vector)") { - val labeledDiscreteData = sc.parallelize( - Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), - LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) val preFilteredData = - Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0))), - LabeledPoint(2.0, 
Vectors.dense(Array(8.0)))) - val model = new ChiSqSelector(1).fit(labeledDiscreteData) - val filteredData = labeledDiscreteData.map { lp => - LabeledPoint(lp.label, model.transform(lp.features)) - }.collect().toSet - assert(filteredData == preFilteredData) - - val preFilteredData2 = Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0, 7.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), @@ -104,36 +92,15 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 4.0)))) - val model2 = new ChiSqSelector(3).fit(labeledDiscreteData) - val filteredData2 = labeledDiscreteData.map { lp => - LabeledPoint(lp.label, model2.transform(lp.features)) + val model = new ChiSqSelector(3).fit(labeledDiscreteData) + val filteredData = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet - assert(filteredData2 === preFilteredData2) + assert(filteredData === preFilteredData) } test("ChiSqSelector transform by Percentile test (sparse & dense vector)") { - val labeledDiscreteData = sc.parallelize( - Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), - LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) val preFilteredData = - Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0))), - 
LabeledPoint(2.0, Vectors.dense(Array(8.0)))) - val model = new ChiSqSelector().setSelectorType("percentile").setPercentile(0.2) - .fit(labeledDiscreteData) - val filteredData = labeledDiscreteData.map { lp => - LabeledPoint(lp.label, model.transform(lp.features)) - }.collect().toSet - assert(filteredData == preFilteredData) - - val preFilteredData2 = Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0, 7.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), @@ -141,37 +108,16 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 4.0)))) - val model2 = new ChiSqSelector().setSelectorType("percentile").setPercentile(0.5) + val model = new ChiSqSelector().setSelectorType("percentile").setPercentile(0.5) .fit(labeledDiscreteData) - val filteredData2 = labeledDiscreteData.map { lp => - LabeledPoint(lp.label, model2.transform(lp.features)) + val filteredData = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet - assert(filteredData2 == preFilteredData2) + assert(filteredData == preFilteredData) } test("ChiSqSelector transform by FPR test (sparse & dense vector)") { - val labeledDiscreteData = sc.parallelize( - Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), - LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) val preFilteredData = - Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, 
Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0)))) - val model = new ChiSqSelector().setSelectorType("fpr").setFpr(0.02) - .fit(labeledDiscreteData) - val filteredData = labeledDiscreteData.map { lp => - LabeledPoint(lp.label, model.transform(lp.features)) - }.collect().toSet - assert(filteredData === preFilteredData) - - val preFilteredData2 = Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0, 7.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), @@ -179,37 +125,16 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 4.0)))) - val model2 = new ChiSqSelector().setSelectorType("fpr").setFpr(0.15) - .fit(labeledDiscreteData) - val filteredData2 = labeledDiscreteData.map { lp => - LabeledPoint(lp.label, model2.transform(lp.features)) - }.collect().toSet - assert(filteredData2 === preFilteredData2) - } - - test("ChiSqSelector transform by FDR test (sparse & dense vector)") { - val labeledDiscreteData = sc.parallelize( - Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), - LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) - val preFilteredData = - Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(2.0, 
Vectors.dense(Array(8.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0)))) - val model = new ChiSqSelector().setSelectorType("fdr").setFdr(0.12) + val model = new ChiSqSelector().setSelectorType("fpr").setFpr(0.15) .fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData === preFilteredData) + } - val preFilteredData2 = + test("ChiSqSelector transform by FDR test (sparse & dense vector)") { + val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), @@ -217,37 +142,16 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0)))) - val model2 = new ChiSqSelector().setSelectorType("fdr").setFdr(0.15) - .fit(labeledDiscreteData) - val filteredData2 = labeledDiscreteData.map { lp => - LabeledPoint(lp.label, model2.transform(lp.features)) - }.collect().toSet - assert(filteredData2 === preFilteredData2) - } - - test("ChiSqSelector transform by FWE test (sparse & dense vector)") { - val labeledDiscreteData = sc.parallelize( - Seq(LabeledPoint(0.0, Vectors.sparse(6, Array((0, 6.0), (1, 7.0), (3, 7.0), (4, 6.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 6.0), (4, 5.0), (5, 9.0)))), - LabeledPoint(1.0, Vectors.sparse(6, Array((1, 9.0), (2, 3.0), (4, 5.0), (5, 5.0)))), - LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 5.0, 6.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) - val preFilteredData = - Set(LabeledPoint(0.0, Vectors.dense(Array(6.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - LabeledPoint(1.0, Vectors.dense(Array(0.0))), - 
LabeledPoint(2.0, Vectors.dense(Array(8.0))), - LabeledPoint(2.0, Vectors.dense(Array(8.0)))) - val model = new ChiSqSelector().setSelectorType("fwe").setFwe(0.15) + val model = new ChiSqSelector().setSelectorType("fdr").setFdr(0.15) .fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData === preFilteredData) + } - val preFilteredData2 = + test("ChiSqSelector transform by FWE test (sparse & dense vector)") { + val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0))), @@ -255,12 +159,12 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0)))) - val model2 = new ChiSqSelector().setSelectorType("fwe").setFwe(0.3) + val model = new ChiSqSelector().setSelectorType("fwe").setFwe(0.3) .fit(labeledDiscreteData) - val filteredData2 = labeledDiscreteData.map { lp => - LabeledPoint(lp.label, model2.transform(lp.features)) + val filteredData = labeledDiscreteData.map { lp => + LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet - assert(filteredData2 === preFilteredData2) + assert(filteredData === preFilteredData) } test("model load / save") { diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 014585e992b47..8488ffd4987fc 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2636,8 +2636,11 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja `percentile` is similar but chooses a fraction of all features instead of a fixed number. `fpr` chooses all features whose p-value is below a threshold, thus controlling the false positive rate of selection. - `fdr` chooses all features whose false discovery rate meets some threshold. 
- `fwe` chooses all features whose family-wise error rate meets some threshold. + `fdr` uses the [Benjamini-Hochberg procedure] + (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) + to choose all features whose false discovery rate is below a threshold. + `fwe` chooses all features whose whose p-values is below a threshold, + thus controlling the family-wise error rate of selection. By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. @@ -2686,10 +2689,10 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja fpr = Param(Params._dummy(), "fpr", "The highest p-value for features to be kept.", typeConverter=TypeConverters.toFloat) - fdr = Param(Params._dummy(), "fdr", "The highest uncorrected p-value for features to be kept.", + fdr = Param(Params._dummy(), "fdr", "The upper bound of the expected false discovery rate.", typeConverter=TypeConverters.toFloat) - fwe = Param(Params._dummy(), "fwe", "The highest uncorrected p-value for features to be kept.", + fwe = Param(Params._dummy(), "fwe", "The upper bound of the expected family-wise error rate.", typeConverter=TypeConverters.toFloat) @keyword_only @@ -2781,7 +2784,7 @@ def getFpr(self): """ return self.getOrDefault(self.fpr) - @since("2.1.0") + @since("2.2.0") def setFdr(self, value): """ Sets the value of :py:attr:`fdr`. @@ -2789,14 +2792,14 @@ def setFdr(self, value): """ return self._set(fdr=value) - @since("2.1.0") + @since("2.2.0") def getFdr(self): """ Gets the value of fdr or its default value. """ return self.getOrDefault(self.fdr) - @since("2.1.0") + @since("2.2.0") def setFwe(self, value): """ Sets the value of :py:attr:`fwe`. @@ -2804,7 +2807,7 @@ def setFwe(self, value): """ return self._set(fwe=value) - @since("2.1.0") + @since("2.2.0") def getFwe(self): """ Gets the value of fwe or its default value. 
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 0699c0d329c42..21f3a458380a9 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -280,8 +280,11 @@ class ChiSqSelector(object): `percentile` is similar but chooses a fraction of all features instead of a fixed number. `fpr` chooses all features whose p-value is below a threshold, thus controlling the false positive rate of selection. - `fdr` chooses all features whose false discovery rate meets some threshold. - `fwe` chooses all features whose family-wise error rate meets some threshold. + `fdr` uses the [Benjamini-Hochberg procedure] + (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) + to choose all features whose false discovery rate is below a threshold. + `fwe` chooses all features whose whose p-values is below a threshold, + thus controlling the family-wise error rate of selection. By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. @@ -344,7 +347,7 @@ def setFpr(self, fpr): self.fpr = float(fpr) return self - @since('2.1.0') + @since('2.2.0') def setFdr(self, fdr): """ set FDR [0.0, 1.0] for feature selection by FDR. @@ -353,7 +356,7 @@ def setFdr(self, fdr): self.fdr = float(fdr) return self - @since('2.1.0') + @since('2.2.0') def setFwe(self, fwe): """ set FWE [0.0, 1.0] for feature selection by FWE. 
From aa5f2cc67a1b523902c52330179fa227159728a5 Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Fri, 23 Dec 2016 23:43:09 +0800 Subject: [PATCH 8/9] fix typo --- docs/ml-features.md | 2 +- docs/mllib-feature-extraction.md | 2 +- .../main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala | 2 +- .../scala/org/apache/spark/mllib/feature/ChiSqSelector.scala | 2 +- .../org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala | 2 +- python/pyspark/ml/feature.py | 2 +- python/pyspark/mllib/feature.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/ml-features.md b/docs/ml-features.md index 4db2907d59ec1..1d3449746c9be 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1428,7 +1428,7 @@ features to choose. It supports five selection methods: `numTopFeatures`, `perce * `percentile` is similar to `numTopFeatures` but chooses a fraction of all features instead of a fixed number. * `fpr` chooses all features whose p-value is below a threshold, thus controlling the false positive rate of selection. * `fdr` uses the [Benjamini-Hochberg procedure](https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) to choose all features whose false discovery rate is below a threshold. -* `fwe` chooses all features whose whose p-values is below a threshold, thus controlling the family-wise error rate of selection. +* `fwe` chooses all features whose p-values is below a threshold, thus controlling the family-wise error rate of selection. By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. The user can choose a selection method using `setSelectorType`. diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md index c67cfbe8e607b..acd28943132db 100644 --- a/docs/mllib-feature-extraction.md +++ b/docs/mllib-feature-extraction.md @@ -233,7 +233,7 @@ features to choose. 
It supports five selection methods: `numTopFeatures`, `perce * `percentile` is similar to `numTopFeatures` but chooses a fraction of all features instead of a fixed number. * `fpr` chooses all features whose p-value is below a threshold, thus controlling the false positive rate of selection. * `fdr` uses the [Benjamini-Hochberg procedure](https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) to choose all features whose false discovery rate is below a threshold. -* `fwe` chooses all features whose whose p-values is below a threshold, thus controlling the family-wise error rate of selection. +* `fwe` chooses all features whose p-values is below a threshold, thus controlling the family-wise error rate of selection. By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. The user can choose a selection method using `setSelectorType`. diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index abfae3f75d753..353bd186daf01 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -148,7 +148,7 @@ private[feature] trait ChiSqSelectorParams extends Params * - `fdr` uses the [Benjamini-Hochberg procedure] * (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) * to choose all features whose false discovery rate is below a threshold. - * - `fwe` chooses all features whose whose p-values is below a threshold, + * - `fwe` chooses all features whose p-values is below a threshold, * thus controlling the family-wise error rate of selection. * By default, the selection method is `numTopFeatures`, with the default number of top features * set to 50. 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala index 934c3b666a3b4..9dea3c3e843c4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala @@ -180,7 +180,7 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] { * - `fdr` uses the [Benjamini-Hochberg procedure] * (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) * to choose all features whose false discovery rate is below a threshold. - * - `fwe` chooses all features whose whose p-values is below a threshold, + * - `fwe` chooses all features whose p-values is below a threshold, * thus controlling the family-wise error rate of selection. * By default, the selection method is `numTopFeatures`, with the default number of top features * set to 50. diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala index 6578963ad8895..b585e09c69f44 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala @@ -83,7 +83,7 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 5.0, 4.0, 4.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 6.0, 4.0, 0.0, 0.0)))), 2) - test("ChiSqSelector transform by KBest test (sparse & dense vector)") { + test("ChiSqSelector transform by numTopFeatures test (sparse & dense vector)") { val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(6.0, 7.0, 7.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 0.0))), diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 8488ffd4987fc..3c1d57daec859 100755 
--- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2639,7 +2639,7 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja `fdr` uses the [Benjamini-Hochberg procedure] (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) to choose all features whose false discovery rate is below a threshold. - `fwe` chooses all features whose whose p-values is below a threshold, + `fwe` chooses all features whose p-values is below a threshold, thus controlling the family-wise error rate of selection. By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 21f3a458380a9..55a917f1d609a 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -283,7 +283,7 @@ class ChiSqSelector(object): `fdr` uses the [Benjamini-Hochberg procedure] (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) to choose all features whose false discovery rate is below a threshold. - `fwe` chooses all features whose whose p-values is below a threshold, + `fwe` chooses all features whose p-values is below a threshold, thus controlling the family-wise error rate of selection. By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. 
From da6ac358ee4861f0d0cd320f46482b2e2535322a Mon Sep 17 00:00:00 2001 From: "Peng, Meng" Date: Tue, 27 Dec 2016 21:22:11 +0800 Subject: [PATCH 9/9] python code style change --- .../mllib/feature/ChiSqSelectorSuite.scala | 2 +- python/pyspark/ml/feature.py | 25 ++++++++++++------- python/pyspark/mllib/feature.py | 25 ++++++++++++------- 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala index b585e09c69f44..305cb4cbbdeea 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala @@ -113,7 +113,7 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet - assert(filteredData == preFilteredData) + assert(filteredData === preFilteredData) } test("ChiSqSelector transform by FPR test (sparse & dense vector)") { diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 3c1d57daec859..dbd17e01d2213 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2632,15 +2632,22 @@ class ChiSqSelector(JavaEstimator, HasFeaturesCol, HasOutputCol, HasLabelCol, Ja Creates a ChiSquared feature selector. The selector supports different selection methods: `numTopFeatures`, `percentile`, `fpr`, `fdr`, `fwe`. - `numTopFeatures` chooses a fixed number of top features according to a chi-squared test. - `percentile` is similar but chooses a fraction of all features instead of a fixed number. - `fpr` chooses all features whose p-value is below a threshold, thus controlling the false - positive rate of selection. 
- `fdr` uses the [Benjamini-Hochberg procedure] - (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) - to choose all features whose false discovery rate is below a threshold. - `fwe` chooses all features whose p-values is below a threshold, - thus controlling the family-wise error rate of selection. + + * `numTopFeatures` chooses a fixed number of top features according to a chi-squared test. + + * `percentile` is similar but chooses a fraction of all features + instead of a fixed number. + + * `fpr` chooses all features whose p-value is below a threshold, + thus controlling the false positive rate of selection. + + * `fdr` uses the `Benjamini-Hochberg procedure <https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure>`_ + to choose all features whose false discovery rate is below a threshold. + + * `fwe` chooses all features whose p-values are below a threshold, + thus controlling the family-wise error rate of selection. + By default, the selection method is `numTopFeatures`, with the default number of top features set to 50. diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index 55a917f1d609a..61f2bc7492ad6 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -276,15 +276,22 @@ class ChiSqSelector(object): Creates a ChiSquared feature selector. The selector supports different selection methods: `numTopFeatures`, `percentile`, `fpr`, `fdr`, `fwe`. - `numTopFeatures` chooses a fixed number of top features according to a chi-squared test. - `percentile` is similar but chooses a fraction of all features instead of a fixed number. - `fpr` chooses all features whose p-value is below a threshold, thus controlling the false - positive rate of selection. - `fdr` uses the [Benjamini-Hochberg procedure] - (https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) - to choose all features whose false discovery rate is below a threshold. 
- `fwe` chooses all features whose p-values is below a threshold, - thus controlling the family-wise error rate of selection. + + * `numTopFeatures` chooses a fixed number of top features according to a chi-squared test. + + * `percentile` is similar but chooses a fraction of all features + instead of a fixed number. + + * `fpr` chooses all features whose p-value is below a threshold, + thus controlling the false positive rate of selection. + + * `fdr` uses the `Benjamini-Hochberg procedure <https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure>`_ + to choose all features whose false discovery rate is below a threshold. + + * `fwe` chooses all features whose p-values are below a threshold, + thus controlling the family-wise error rate of selection. + By default, the selection method is `numTopFeatures`, with the default number of top features set to 50.