From 2adebe8de3881509e510fc518c562d1141ccd0ef Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Wed, 10 Aug 2016 13:40:18 +0800
Subject: [PATCH 01/21] Add a ChiSqSelector based on the False Positive Rate
 (FPR) test

---
 .../spark/mllib/feature/ChiSqSelector.scala   | 29 +++++++++++++++++--
 .../mllib/feature/ChiSqSelectorSuite.scala    | 18 ++++++++++++
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index c8c2823bbaf04..f3316eeee8fec 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -173,8 +173,8 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] {
  * Creates a ChiSquared feature selector.
  * @param numTopFeatures number of features that selector will select
  *                       (ordered by statistic value descending)
- *                       Note that if the number of features is < numTopFeatures, then this will
- *                       select all features.
+ *                       Note that if the number of features is less than numTopFeatures,
+ *                       then this will select all features.
  */
 @Since("1.3.0")
 class ChiSqSelector @Since("1.3.0") (
@@ -197,3 +197,28 @@ class ChiSqSelector @Since("1.3.0") (
     new ChiSqSelectorModel(indices)
   }
 }
+
+/**
+ * Creates a ChiSquared feature selector by False Positive Rate (FPR) test.
+ * @param alpha the highest p-value for features to be kept
+ */
+@Since("2.1.0")
+class ChiSqSelectorByFpr @Since("2.1.0") (
+  @Since("2.1.0") val alpha: Double) extends Serializable {
+
+  /**
+   * Returns a ChiSquared feature selector by FPR.
+   *
+   * @param data an `RDD[LabeledPoint]` containing the labeled dataset with categorical features.
+   *             Real-valued features will be treated as categorical for each distinct value.
+   *             Apply feature discretizer before using this function.
+   */
+  @Since("2.1.0")
+  def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
+    val indices = Statistics.chiSqTest(data)
+      .zipWithIndex.filter { case (res, _) => res.pValue < alpha }
+      .map { case (_, indices) => indices }
+      .sorted
+    new ChiSqSelectorModel(indices)
+  }
+}
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala
index 734800a9afad6..6b2209c8a7c15 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala
@@ -65,6 +65,24 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(filteredData == preFilteredData)
   }
 
+  test("ChiSqSelectorByFpr transform test (sparse & dense vector)") {
+    val labeledDiscreteData = sc.parallelize(
+      Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))),
+        LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))),
+        LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))),
+        LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2)
+    val preFilteredData =
+      Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))),
+        LabeledPoint(1.0, Vectors.dense(Array(4.0))),
+        LabeledPoint(1.0, Vectors.dense(Array(4.0))),
+        LabeledPoint(2.0, Vectors.dense(Array(9.0))))
+    val model = new ChiSqSelectorByFpr(0.1).fit(labeledDiscreteData)
+    val filteredData = labeledDiscreteData.map { lp =>
+      LabeledPoint(lp.label, model.transform(lp.features))
+    }.collect().toSet
+    assert(filteredData == preFilteredData)
+  }
+
   test("model load / save") {
     val model = ChiSqSelectorSuite.createModel()
     val tempDir = Utils.createTempDir()

From 7623563884355a04867ce5271baa286f65180e62 Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Tue, 16 Aug 2016 21:36:11 +0800
Subject: [PATCH 02/21] Configure ChiSqSelector so the numTopFeatures,
 percentile, and FPR selectors reuse the ChiSqTestResult

---
 .../mllib/JavaChiSqSelectorExample.java       |   3 +-
 .../examples/mllib/ChiSqSelectorExample.scala |   3 +-
 .../spark/ml/feature/ChiSqSelector.scala      |  60 ++++++++++-
 .../mllib/api/python/PythonMLLibAPI.scala     |   4 +-
 .../spark/mllib/feature/ChiSqSelector.scala   | 102 +++++++++++-------
 .../mllib/feature/ChiSqSelectorSuite.scala    |   6 +-
 6 files changed, 126 insertions(+), 52 deletions(-)

diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java
index ad44acb4cd6e3..f0619b7bc5685 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java
@@ -56,7 +56,8 @@ public LabeledPoint call(LabeledPoint lp) {
     );
 
     // Create ChiSqSelector that will select top 50 of 692 features
-    ChiSqSelector selector = new ChiSqSelector(50);
+    ChiSqSelector selector = new ChiSqSelector();
+    selector.setNumTopFeatures(50);
     // Create ChiSqSelector model (selecting features)
     final ChiSqSelectorModel transformer = selector.fit(discretizedData.rdd());
     // Filter the top 50 features from each feature vector
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala
index 5e400b7d715b4..9fb520ce56acc 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala
@@ -43,7 +43,8 @@ object ChiSqSelectorExample {
       LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map { x => (x / 16).floor }))
     }
     // Create ChiSqSelector that will select top 50 of 692 features
-    val selector = new ChiSqSelector(50)
+    val selector = new ChiSqSelector()
+    selector.setNumTopFeatures(50)
     // Create ChiSqSelector model (selecting features)
     val transformer = selector.fit(discretizedData)
     // Filter the top 50 features from each feature vector
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index 1482eb3d1f7a6..439514bdb4a4c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -27,6 +27,7 @@ import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util._
 import org.apache.spark.mllib.feature
+import org.apache.spark.mllib.feature.SelectorType
 import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
 import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
 import org.apache.spark.rdd.RDD
@@ -51,11 +52,29 @@ private[feature] trait ChiSqSelectorParams extends Params
       " number of features is < numTopFeatures, then this will select all features.",
     ParamValidators.gtEq(1))
   setDefault(numTopFeatures -> 50)
+  final val percentile = new IntParam(this, "percentile",
+    "Percentile of features that selector will select, ordered by statistics value descending.",
+    ParamValidators.gtEq(0))
+  setDefault(percentile -> 10)
+
+  final val alpha = new DoubleParam(this, "alpha",
+    "The highest p-value for features to be kept.",
+    ParamValidators.gtEq(0))
+  setDefault(alpha -> 0.05)
+
+  final val selectorType = SelectorType.KBest
 
   /** @group getParam */
   def getNumTopFeatures: Int = $(numTopFeatures)
+
+  def getPercentile: Int = $(percentile)
+
+  def getAlpha: Double = $(alpha)
+
+  def getSelectorType: SelectorType.Value = selectorType
 }
 
+
 /**
  * Chi-Squared feature selection, which selects categorical features to use for predicting a
  * categorical label.
@@ -66,10 +85,26 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str
 
   @Since("1.6.0")
   def this() = this(Identifiable.randomUID("chiSqSelector"))
-
+  val chiSqSelector = new feature.ChiSqSelector()
   /** @group setParam */
   @Since("1.6.0")
-  def setNumTopFeatures(value: Int): this.type = set(numTopFeatures, value)
+  def setNumTopFeatures(value: Int): this.type = {
+    chiSqSelector.setNumTopFeatures(value)
+    chiSqSelector.setSelectorType(SelectorType.KBest)
+    set(numTopFeatures, value)
+  }
+
+  def setPercentile(value: Int): this.type = {
+    chiSqSelector.setPercentile(value)
+    chiSqSelector.setSelectorType(SelectorType.Percentile)
+    set(percentile, value)
+  }
+
+  def setAlpha(value: Double): this.type = {
+    chiSqSelector.setAlpha(value)
+    chiSqSelector.setSelectorType(SelectorType.Fpr)
+    set(alpha, value)
+  }
 
   /** @group setParam */
   @Since("1.6.0")
@@ -89,10 +124,25 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str
     val input: RDD[OldLabeledPoint] =
       dataset.select(col($(labelCol)).cast(DoubleType), col($(featuresCol))).rdd.map {
         case Row(label: Double, features: Vector) =>
-          OldLabeledPoint(label, OldVectors.fromML(features))
+        OldLabeledPoint(label, OldVectors.fromML(features))
       }
-    val chiSqSelector = new feature.ChiSqSelector($(numTopFeatures)).fit(input)
-    copyValues(new ChiSqSelectorModel(uid, chiSqSelector).setParent(this))
+    val model = chiSqSelector.fit(input)
+    copyValues(new ChiSqSelectorModel(uid, model).setParent(this))
+  }
+
+  def selectKBest(value: Int): ChiSqSelectorModel = {
+    val model = chiSqSelector.selectKBest(value)
+    copyValues(new ChiSqSelectorModel(uid, model).setParent(this))
+  }
+
+  def selectPercentile(value: Int): ChiSqSelectorModel = {
+    val model = chiSqSelector.selectPercentile(value)
+    copyValues(new ChiSqSelectorModel(uid, model).setParent(this))
+  }
+
+  def selectFpr(value: Double): ChiSqSelectorModel = {
+    val model = chiSqSelector.selectFpr(value)
+    copyValues(new ChiSqSelectorModel(uid, model).setParent(this))
   }
 
   @Since("1.6.0")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index a80cca70f4b28..bdcfe70651e3d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -634,8 +634,8 @@ private[python] class PythonMLLibAPI extends Serializable {
    * Extra care needs to be taken in the Python code to ensure it gets freed on
    * exit; see the Py4J documentation.
    */
-  def fitChiSqSelector(numTopFeatures: Int, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = {
-    new ChiSqSelector(numTopFeatures).fit(data.rdd)
+  def fitChiSqSelector(data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = {
+    new ChiSqSelector().fit(data.rdd)
   }
 
   /**
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index f3316eeee8fec..9bc75c65165fd 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -27,22 +27,26 @@ import org.apache.spark.annotation.Since
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.stat.Statistics
+import org.apache.spark.mllib.stat.test.ChiSqTestResult
 import org.apache.spark.mllib.util.{Loader, Saveable}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.SparkContext
 import org.apache.spark.sql.{Row, SparkSession}
 
+object SelectorType extends Enumeration {
+  type SelectorType = Value
+  val KBest, Percentile, Fpr = Value
+}
+
 /**
  * Chi Squared selector model.
  *
- * @param selectedFeatures list of indices to select (filter). Must be ordered asc
+ * @param selectedFeatures list of indices to select (filter).
  */
 @Since("1.3.0")
 class ChiSqSelectorModel @Since("1.3.0") (
   @Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer with Saveable {
 
-  require(isSorted(selectedFeatures), "Array has to be sorted asc")
-
   protected def isSorted(array: Array[Int]): Boolean = {
     var i = 1
     val len = array.length
@@ -69,21 +73,23 @@ class ChiSqSelectorModel @Since("1.3.0") (
    * Preserves the order of filtered features the same as their indices are stored.
    * Might be moved to Vector as .slice
    * @param features vector
-   * @param filterIndices indices of features to filter, must be ordered asc
+   * @param filterIndices indices of features to filter
    */
   private def compress(features: Vector, filterIndices: Array[Int]): Vector = {
+    val orderedIndices = filterIndices.sorted
+    require(isSorted(orderedIndices), "Array has to be sorted asc")
     features match {
       case SparseVector(size, indices, values) =>
-        val newSize = filterIndices.length
+        val newSize = orderedIndices.length
         val newValues = new ArrayBuilder.ofDouble
         val newIndices = new ArrayBuilder.ofInt
         var i = 0
         var j = 0
         var indicesIdx = 0
         var filterIndicesIdx = 0
-        while (i < indices.length && j < filterIndices.length) {
+        while (i < indices.length && j < orderedIndices.length) {
           indicesIdx = indices(i)
-          filterIndicesIdx = filterIndices(j)
+          filterIndicesIdx = orderedIndices(j)
           if (indicesIdx == filterIndicesIdx) {
             newIndices += j
             newValues += values(i)
@@ -101,7 +107,7 @@ class ChiSqSelectorModel @Since("1.3.0") (
         Vectors.sparse(newSize, newIndices.result(), newValues.result())
       case DenseVector(values) =>
         val values = features.toArray
-        Vectors.dense(filterIndices.map(i => values(i)))
+        Vectors.dense(orderedIndices.map(i => values(i)))
       case other =>
         throw new UnsupportedOperationException(
           s"Only sparse and dense vectors are supported but got ${other.getClass}.")
@@ -171,14 +177,34 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] {
 
 /**
  * Creates a ChiSquared feature selector.
- * @param numTopFeatures number of features that selector will select
- *                       (ordered by statistic value descending)
- *                       Note that if the number of features is less than numTopFeatures,
- *                       then this will select all features.
  */
 @Since("1.3.0")
-class ChiSqSelector @Since("1.3.0") (
-  @Since("1.3.0") val numTopFeatures: Int) extends Serializable {
+class ChiSqSelector @Since("1.3.0") () extends Serializable {
+  var numTopFeatures: Int = 1
+  var percentile: Int = 10
+  var alpha: Double = 0.05
+  var selectorType = SelectorType.KBest
+  var chiSqTestResult: Array[ChiSqTestResult] = new Array[ChiSqTestResult](0)
+
+  def setNumTopFeatures(value: Int): this.type = {
+    numTopFeatures = value
+    selectorType = SelectorType.KBest
+    this
+  }
+  def setPercentile(value: Int): this.type = {
+    percentile = value
+    selectorType = SelectorType.Percentile
+    this
+  }
+  def setAlpha(value: Double): this.type = {
+    alpha = value
+    selectorType = SelectorType.Fpr
+    this
+  }
+  def setSelectorType(value: SelectorType.Value): this.type = {
+    selectorType = value
+    this
+  }
 
   /**
    * Returns a ChiSquared feature selector.
@@ -189,36 +215,32 @@ class ChiSqSelector @Since("1.3.0") (
    */
   @Since("1.3.0")
   def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
-    val indices = Statistics.chiSqTest(data)
-      .zipWithIndex.sortBy { case (res, _) => -res.statistic }
-      .take(numTopFeatures)
-      .map { case (_, indices) => indices }
-      .sorted
+    chiSqTestResult = Statistics.chiSqTest(data)
+      selectorType match {
+        case SelectorType.KBest => selectKBest(numTopFeatures)
+        case SelectorType.Percentile => selectPercentile(percentile)
+        case SelectorType.Fpr => selectFpr(alpha)
+      }
+  }
+
+  def selectKBest(value: Int): ChiSqSelectorModel = {
+    val indices = chiSqTestResult.zipWithIndex.sortBy { case (res, _) => -res.statistic }
+    .take(numTopFeatures)
+    .map { case (_, indices) => indices }
     new ChiSqSelectorModel(indices)
   }
-}
 
-/**
- * Creates a ChiSquared feature selector by False Positive Rate (FPR) test.
- * @param alpha the highest p-value for features to be kept
- */
-@Since("2.1.0")
-class ChiSqSelectorByFpr @Since("2.1.0") (
-  @Since("2.1.0") val alpha: Double) extends Serializable {
+  def selectPercentile(value: Int): ChiSqSelectorModel = {
+    val indices = chiSqTestResult.zipWithIndex.sortBy { case (res, _) => -res.statistic }
+    .take((chiSqTestResult.length * percentile / 100).toInt)
+    .map { case (_, indices) => indices }
+    new ChiSqSelectorModel(indices)
+  }
 
-  /**
-   * Returns a ChiSquared feature selector by FPR.
-   *
-   * @param data an `RDD[LabeledPoint]` containing the labeled dataset with categorical features.
-   *             Real-valued features will be treated as categorical for each distinct value.
-   *             Apply feature discretizer before using this function.
-   */
-  @Since("2.1.0")
-  def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
-    val indices = Statistics.chiSqTest(data)
-      .zipWithIndex.filter { case (res, _) => res.pValue < alpha }
-      .map { case (_, indices) => indices }
-      .sorted
+  def selectFpr(value: Double): ChiSqSelectorModel = {
+    val indices = chiSqTestResult.zipWithIndex.filter{ case (res, _) => res.pValue < alpha }
+    .map { case (_, indices) => indices }
     new ChiSqSelectorModel(indices)
   }
 }
+
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala
index 6b2209c8a7c15..d61888df9c0dc 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala
@@ -58,14 +58,14 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext {
         LabeledPoint(1.0, Vectors.dense(Array(6.0))),
         LabeledPoint(1.0, Vectors.dense(Array(8.0))),
         LabeledPoint(2.0, Vectors.dense(Array(5.0))))
-    val model = new ChiSqSelector(1).fit(labeledDiscreteData)
+    val model = new ChiSqSelector().fit(labeledDiscreteData)
     val filteredData = labeledDiscreteData.map { lp =>
       LabeledPoint(lp.label, model.transform(lp.features))
     }.collect().toSet
     assert(filteredData == preFilteredData)
   }
 
-  test("ChiSqSelectorByFpr transform test (sparse & dense vector)") {
+  test("ChiSqSelector by FPR transform test (sparse & dense vector)") {
     val labeledDiscreteData = sc.parallelize(
       Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))),
         LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))),
@@ -76,7 +76,7 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext {
         LabeledPoint(1.0, Vectors.dense(Array(4.0))),
         LabeledPoint(1.0, Vectors.dense(Array(4.0))),
         LabeledPoint(2.0, Vectors.dense(Array(9.0))))
-    val model = new ChiSqSelectorByFpr(0.1).fit(labeledDiscreteData)
+    val model = new ChiSqSelector().setAlpha(0.1).fit(labeledDiscreteData)
     val filteredData = labeledDiscreteData.map { lp =>
       LabeledPoint(lp.label, model.transform(lp.features))
     }.collect().toSet

From 3d6aecb8441503c9c3d62a2d8a3d48824b9d6637 Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Wed, 17 Aug 2016 10:34:59 +0800
Subject: [PATCH 03/21] Configure ChiSqSelector so the KBest, Percentile,
 and FPR selectors reuse the ChiSqTestResult

---
 .../mllib/JavaChiSqSelectorExample.java       |  3 +-
 .../examples/mllib/ChiSqSelectorExample.scala |  3 +-
 .../spark/ml/feature/ChiSqSelector.scala      | 12 +++---
 .../mllib/api/python/PythonMLLibAPI.scala     |  4 +-
 .../spark/mllib/feature/ChiSqSelector.scala   | 38 ++++++++++---------
 .../mllib/feature/ChiSqSelectorSuite.scala    |  2 +-
 6 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java
index f0619b7bc5685..ad44acb4cd6e3 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java
@@ -56,8 +56,7 @@ public LabeledPoint call(LabeledPoint lp) {
     );
 
     // Create ChiSqSelector that will select top 50 of 692 features
-    ChiSqSelector selector = new ChiSqSelector();
-    selector.setNumTopFeatures(50);
+    ChiSqSelector selector = new ChiSqSelector(50);
     // Create ChiSqSelector model (selecting features)
     final ChiSqSelectorModel transformer = selector.fit(discretizedData.rdd());
     // Filter the top 50 features from each feature vector
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala
index 9fb520ce56acc..5e400b7d715b4 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/ChiSqSelectorExample.scala
@@ -43,8 +43,7 @@ object ChiSqSelectorExample {
       LabeledPoint(lp.label, Vectors.dense(lp.features.toArray.map { x => (x / 16).floor }))
     }
     // Create ChiSqSelector that will select top 50 of 692 features
-    val selector = new ChiSqSelector()
-    selector.setNumTopFeatures(50)
+    val selector = new ChiSqSelector(50)
     // Create ChiSqSelector model (selecting features)
     val transformer = selector.fit(discretizedData)
     // Filter the top 50 features from each feature vector
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index 439514bdb4a4c..a44ac2fe73aea 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -27,7 +27,7 @@ import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util._
 import org.apache.spark.mllib.feature
-import org.apache.spark.mllib.feature.SelectorType
+import org.apache.spark.mllib.feature.ChiSqSelectorType
 import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
 import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
 import org.apache.spark.rdd.RDD
@@ -62,7 +62,7 @@ private[feature] trait ChiSqSelectorParams extends Params
     ParamValidators.gtEq(0))
   setDefault(alpha -> 0.05)
 
-  final val selectorType = SelectorType.KBest
+  final val selectorType = ChiSqSelectorType.KBest
 
   /** @group getParam */
   def getNumTopFeatures: Int = $(numTopFeatures)
@@ -71,7 +71,7 @@ private[feature] trait ChiSqSelectorParams extends Params
 
   def getAlpha: Double = $(alpha)
 
-  def getSelectorType: SelectorType.Value = selectorType
+  def getChiSqSelectorType: ChiSqSelectorType.Value = selectorType
 }
 
 
@@ -90,19 +90,19 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str
   @Since("1.6.0")
   def setNumTopFeatures(value: Int): this.type = {
     chiSqSelector.setNumTopFeatures(value)
-    chiSqSelector.setSelectorType(SelectorType.KBest)
+    chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.KBest)
     set(numTopFeatures, value)
   }
 
   def setPercentile(value: Int): this.type = {
     chiSqSelector.setPercentile(value)
-    chiSqSelector.setSelectorType(SelectorType.Percentile)
+    chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.Percentile)
     set(percentile, value)
   }
 
   def setAlpha(value: Double): this.type = {
     chiSqSelector.setAlpha(value)
-    chiSqSelector.setSelectorType(SelectorType.Fpr)
+    chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.Fpr)
     set(alpha, value)
   }
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index bdcfe70651e3d..a80cca70f4b28 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -634,8 +634,8 @@ private[python] class PythonMLLibAPI extends Serializable {
    * Extra care needs to be taken in the Python code to ensure it gets freed on
    * exit; see the Py4J documentation.
    */
-  def fitChiSqSelector(data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = {
-    new ChiSqSelector().fit(data.rdd)
+  def fitChiSqSelector(numTopFeatures: Int, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = {
+    new ChiSqSelector(numTopFeatures).fit(data.rdd)
   }
 
   /**
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index 9bc75c65165fd..e2345b85a279e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -33,7 +33,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.SparkContext
 import org.apache.spark.sql.{Row, SparkSession}
 
-object SelectorType extends Enumeration {
+object ChiSqSelectorType extends Enumeration {
   type SelectorType = Value
   val KBest, Percentile, Fpr = Value
 }
@@ -77,7 +77,6 @@ class ChiSqSelectorModel @Since("1.3.0") (
    */
   private def compress(features: Vector, filterIndices: Array[Int]): Vector = {
     val orderedIndices = filterIndices.sorted
-    require(isSorted(orderedIndices), "Array has to be sorted asc")
     features match {
       case SparseVector(size, indices, values) =>
         val newSize = orderedIndices.length
@@ -178,30 +177,34 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] {
 /**
  * Creates a ChiSquared feature selector.
  */
-@Since("1.3.0")
-class ChiSqSelector @Since("1.3.0") () extends Serializable {
-  var numTopFeatures: Int = 1
-  var percentile: Int = 10
-  var alpha: Double = 0.05
-  var selectorType = SelectorType.KBest
-  var chiSqTestResult: Array[ChiSqTestResult] = new Array[ChiSqTestResult](0)
-
+@Since("2.1.0")
+class ChiSqSelector @Since("2.1.0") () extends Serializable {
+  private var numTopFeatures: Int = 1
+  private var percentile: Int = 10
+  private var alpha: Double = 0.05
+  private var selectorType = ChiSqSelectorType.KBest
+  private var chiSqTestResult: Array[ChiSqTestResult] = _
+
+  def this(numTopFeatures: Int) {
+    this()
+    this.numTopFeatures = numTopFeatures
+  }
   def setNumTopFeatures(value: Int): this.type = {
     numTopFeatures = value
-    selectorType = SelectorType.KBest
+    selectorType = ChiSqSelectorType.KBest
     this
   }
   def setPercentile(value: Int): this.type = {
     percentile = value
-    selectorType = SelectorType.Percentile
+    selectorType = ChiSqSelectorType.Percentile
     this
   }
   def setAlpha(value: Double): this.type = {
     alpha = value
-    selectorType = SelectorType.Fpr
+    selectorType = ChiSqSelectorType.Fpr
     this
   }
-  def setSelectorType(value: SelectorType.Value): this.type = {
+  def setChiSqSelectorType(value: ChiSqSelectorType.Value): this.type = {
     selectorType = value
     this
   }
@@ -217,9 +220,10 @@ class ChiSqSelector @Since("1.3.0") () extends Serializable {
   def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
     chiSqTestResult = Statistics.chiSqTest(data)
       selectorType match {
-        case SelectorType.KBest => selectKBest(numTopFeatures)
-        case SelectorType.Percentile => selectPercentile(percentile)
-        case SelectorType.Fpr => selectFpr(alpha)
+        case ChiSqSelectorType.KBest => selectKBest(numTopFeatures)
+        case ChiSqSelectorType.Percentile => selectPercentile(percentile)
+        case ChiSqSelectorType.Fpr => selectFpr(alpha)
+        case _ => throw new Exception
       }
   }
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala
index d61888df9c0dc..e181a544f7159 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/feature/ChiSqSelectorSuite.scala
@@ -58,7 +58,7 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext {
         LabeledPoint(1.0, Vectors.dense(Array(6.0))),
         LabeledPoint(1.0, Vectors.dense(Array(8.0))),
         LabeledPoint(2.0, Vectors.dense(Array(5.0))))
-    val model = new ChiSqSelector().fit(labeledDiscreteData)
+    val model = new ChiSqSelector(1).fit(labeledDiscreteData)
     val filteredData = labeledDiscreteData.map { lp =>
       LabeledPoint(lp.label, model.transform(lp.features))
     }.collect().toSet

From 5305709c9d4029186318b99fa9c7c483897aa653 Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Wed, 17 Aug 2016 17:59:16 +0800
Subject: [PATCH 04/21] add Since annotation

---
 .../spark/ml/feature/ChiSqSelector.scala      | 63 ++++++++++++-------
 .../spark/mllib/feature/ChiSqSelector.scala   | 34 ++++++----
 .../spark/ml/feature/ChiSqSelectorSuite.scala | 11 +++-
 3 files changed, 74 insertions(+), 34 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index a44ac2fe73aea..d6b847a7770b0 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -27,7 +27,6 @@ import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util._
 import org.apache.spark.mllib.feature
-import org.apache.spark.mllib.feature.ChiSqSelectorType
 import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
 import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
 import org.apache.spark.rdd.RDD
@@ -52,28 +51,33 @@ private[feature] trait ChiSqSelectorParams extends Params
       " number of features is < numTopFeatures, then this will select all features.",
     ParamValidators.gtEq(1))
   setDefault(numTopFeatures -> 50)
-  final val percentile = new IntParam(this, "percentile",
+
+  /** @group getParam */
+  def getNumTopFeatures: Int = $(numTopFeatures)
+
+  final val percentile = new DoubleParam(this, "percentile",
     "Percentile of features that selector will select, ordered by statistics value descending.",
     ParamValidators.gtEq(0))
   setDefault(percentile -> 10)
 
+  /** @group getParam */
+  def getPercentile: Double = $(percentile)
+
   final val alpha = new DoubleParam(this, "alpha",
     "The highest p-value for features to be kept.",
     ParamValidators.gtEq(0))
   setDefault(alpha -> 0.05)
 
-  final val selectorType = ChiSqSelectorType.KBest
-
   /** @group getParam */
-  def getNumTopFeatures: Int = $(numTopFeatures)
-
-  def getPercentile: Int = $(percentile)
-
   def getAlpha: Double = $(alpha)
 
-  def getChiSqSelectorType: ChiSqSelectorType.Value = selectorType
-}
+  final val selectorType = new Param[String](this, "selectorType",
+    "ChiSqSelector Type: KBest, Percentile, Fpr")
+  setDefault(selectorType -> "KBest")
 
+  /** @group getParam */
+  def getChiSqSelectorType: String = $(selectorType)
+}
 
 /**
  * Chi-Squared feature selection, which selects categorical features to use for predicting a
@@ -85,24 +89,26 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str
 
   @Since("1.6.0")
   def this() = this(Identifiable.randomUID("chiSqSelector"))
-  val chiSqSelector = new feature.ChiSqSelector()
+
+  @Since("2.1.0")
+  var chiSqSelector: feature.ChiSqSelector = null
+
   /** @group setParam */
-  @Since("1.6.0")
+  @Since("2.1.0")
   def setNumTopFeatures(value: Int): this.type = {
-    chiSqSelector.setNumTopFeatures(value)
-    chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.KBest)
+    set(selectorType, "KBest")
     set(numTopFeatures, value)
   }
 
-  def setPercentile(value: Int): this.type = {
-    chiSqSelector.setPercentile(value)
-    chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.Percentile)
+  @Since("2.1.0")
+  def setPercentile(value: Double): this.type = {
+    set(selectorType, "Percentile")
     set(percentile, value)
   }
 
+  @Since("2.1.0")
   def setAlpha(value: Double): this.type = {
-    chiSqSelector.setAlpha(value)
-    chiSqSelector.setChiSqSelectorType(ChiSqSelectorType.Fpr)
+    set(selectorType, "Fpr")
     set(alpha, value)
   }
 
@@ -124,23 +130,38 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str
     val input: RDD[OldLabeledPoint] =
       dataset.select(col($(labelCol)).cast(DoubleType), col($(featuresCol))).rdd.map {
         case Row(label: Double, features: Vector) =>
-        OldLabeledPoint(label, OldVectors.fromML(features))
+          OldLabeledPoint(label, OldVectors.fromML(features))
       }
+    $(selectorType) match {
+      case "KBest" =>
+        chiSqSelector = new feature.ChiSqSelector().setNumTopFeatures($(numTopFeatures))
+      case "Percentile" =>
+        chiSqSelector = new feature.ChiSqSelector().setPercentile($(percentile))
+      case "Fpr" =>
+        chiSqSelector = new feature.ChiSqSelector().setAlpha($(alpha))
+      case _ => throw new Exception("Unknown ChiSqSelector Type.")
+    }
     val model = chiSqSelector.fit(input)
     copyValues(new ChiSqSelectorModel(uid, model).setParent(this))
   }
 
+  @Since("2.1.0")
   def selectKBest(value: Int): ChiSqSelectorModel = {
+    require(chiSqSelector != null, "ChiSqSelector has not been created.")
     val model = chiSqSelector.selectKBest(value)
     copyValues(new ChiSqSelectorModel(uid, model).setParent(this))
   }
 
-  def selectPercentile(value: Int): ChiSqSelectorModel = {
+  @Since("2.1.0")
+  def selectPercentile(value: Double): ChiSqSelectorModel = {
+    require(chiSqSelector != null, "ChiSqSelector has not been created.")
     val model = chiSqSelector.selectPercentile(value)
     copyValues(new ChiSqSelectorModel(uid, model).setParent(this))
   }
 
+  @Since("2.1.0")
   def selectFpr(value: Double): ChiSqSelectorModel = {
+    require(chiSqSelector != null, "ChiSqSelector has not been created.")
     val model = chiSqSelector.selectFpr(value)
     copyValues(new ChiSqSelectorModel(uid, model).setParent(this))
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index e2345b85a279e..1c3b49a04b843 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -33,6 +33,7 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.SparkContext
 import org.apache.spark.sql.{Row, SparkSession}
 
+@Since("2.1.0")
 object ChiSqSelectorType extends Enumeration {
   type SelectorType = Value
   val KBest, Percentile, Fpr = Value
@@ -179,31 +180,40 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] {
  */
 @Since("2.1.0")
 class ChiSqSelector @Since("2.1.0") () extends Serializable {
-  private var numTopFeatures: Int = 1
-  private var percentile: Int = 10
+  private var numTopFeatures: Int = 50
+  private var percentile: Double = 10.0
   private var alpha: Double = 0.05
   private var selectorType = ChiSqSelectorType.KBest
   private var chiSqTestResult: Array[ChiSqTestResult] = _
 
+  @Since("1.3.0")
   def this(numTopFeatures: Int) {
     this()
     this.numTopFeatures = numTopFeatures
   }
+
+  @Since("2.1.0")
   def setNumTopFeatures(value: Int): this.type = {
     numTopFeatures = value
     selectorType = ChiSqSelectorType.KBest
     this
   }
-  def setPercentile(value: Int): this.type = {
+
+  @Since("2.1.0")
+  def setPercentile(value: Double): this.type = {
     percentile = value
     selectorType = ChiSqSelectorType.Percentile
     this
   }
+
+  @Since("2.1.0")
   def setAlpha(value: Double): this.type = {
     alpha = value
     selectorType = ChiSqSelectorType.Fpr
     this
   }
+
+  @Since("2.1.0")
   def setChiSqSelectorType(value: ChiSqSelectorType.Value): this.type = {
     selectorType = value
     this
@@ -219,14 +229,15 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
   @Since("1.3.0")
   def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
     chiSqTestResult = Statistics.chiSqTest(data)
-      selectorType match {
-        case ChiSqSelectorType.KBest => selectKBest(numTopFeatures)
-        case ChiSqSelectorType.Percentile => selectPercentile(percentile)
-        case ChiSqSelectorType.Fpr => selectFpr(alpha)
-        case _ => throw new Exception
-      }
+    selectorType match {
+      case ChiSqSelectorType.KBest => selectKBest(numTopFeatures)
+      case ChiSqSelectorType.Percentile => selectPercentile(percentile)
+      case ChiSqSelectorType.Fpr => selectFpr(alpha)
+      case _ => throw new Exception("Unknown ChiSqSelector Type")
+    }
   }
 
+  @Since("2.1.0")
   def selectKBest(value: Int): ChiSqSelectorModel = {
     val indices = chiSqTestResult.zipWithIndex.sortBy { case (res, _) => -res.statistic }
     .take(numTopFeatures)
@@ -234,17 +245,18 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
     new ChiSqSelectorModel(indices)
   }
 
-  def selectPercentile(value: Int): ChiSqSelectorModel = {
+  @Since("2.1.0")
+  def selectPercentile(value: Double): ChiSqSelectorModel = {
     val indices = chiSqTestResult.zipWithIndex.sortBy { case (res, _) => -res.statistic }
     .take((chiSqTestResult.length * percentile / 100).toInt)
     .map { case (_, indices) => indices }
     new ChiSqSelectorModel(indices)
   }
 
+  @Since("2.1.0")
   def selectFpr(value: Double): ChiSqSelectorModel = {
     val indices = chiSqTestResult.zipWithIndex.filter{ case (res, _) => res.pValue < alpha }
     .map { case (_, indices) => indices }
     new ChiSqSelectorModel(indices)
   }
 }
-
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala
index 3558290b23ae0..a29ff83ae0cce 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala
@@ -49,16 +49,23 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
       .map(x => (x._1.label, x._1.features, x._2))
       .toDF("label", "data", "preFilteredData")
 
-    val model = new ChiSqSelector()
+    val selector = new ChiSqSelector()
       .setNumTopFeatures(1)
       .setFeaturesCol("data")
       .setLabelCol("label")
       .setOutputCol("filtered")
 
-    model.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach {
+    selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach {
       case Row(vec1: Vector, vec2: Vector) =>
         assert(vec1 ~== vec2 absTol 1e-1)
     }
+
+    selector.selectPercentile(34).transform(df)
+    .select("filtered", "preFilteredData").collect().foreach {
+      case Row(vec1: Vector, vec2: Vector) =>
+        assert(vec1 ~== vec2 absTol 1e-1)
+    }
+
   }
 
   test("ChiSqSelector read/write") {

From 1e8d83a58b919256435d7f183a4cfb2154dfd2ee Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Mon, 22 Aug 2016 11:09:33 +0800
Subject: [PATCH 05/21] Do not reuse the ChiSqTestResult, to be consistent
 with other methods

---
 .../spark/ml/feature/ChiSqSelector.scala      | 44 +++++--------------
 .../spark/mllib/feature/ChiSqSelector.scala   | 44 ++++++-------------
 .../spark/ml/feature/ChiSqSelectorSuite.scala |  2 +-
 3 files changed, 25 insertions(+), 65 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index d6b847a7770b0..884da6de85450 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -27,6 +27,7 @@ import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util._
 import org.apache.spark.mllib.feature
+import org.apache.spark.mllib.feature.ChiSqSelectorType
 import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
 import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
 import org.apache.spark.rdd.RDD
@@ -73,7 +74,7 @@ private[feature] trait ChiSqSelectorParams extends Params
 
   final val selectorType = new Param[String](this, "selectorType",
     "ChiSqSelector Type: KBest, Percentile, Fpr")
-  setDefault(selectorType -> "KBest")
+  setDefault(selectorType -> ChiSqSelectorType.KBest.toString)
 
   /** @group getParam */
   def getChiSqSelectorType: String = $(selectorType)
@@ -90,25 +91,22 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str
   @Since("1.6.0")
   def this() = this(Identifiable.randomUID("chiSqSelector"))
 
-  @Since("2.1.0")
-  var chiSqSelector: feature.ChiSqSelector = null
-
   /** @group setParam */
   @Since("2.1.0")
   def setNumTopFeatures(value: Int): this.type = {
-    set(selectorType, "KBest")
+    set(selectorType, ChiSqSelectorType.KBest.toString)
     set(numTopFeatures, value)
   }
 
   @Since("2.1.0")
   def setPercentile(value: Double): this.type = {
-    set(selectorType, "Percentile")
+    set(selectorType, ChiSqSelectorType.Percentile.toString)
     set(percentile, value)
   }
 
   @Since("2.1.0")
   def setAlpha(value: Double): this.type = {
-    set(selectorType, "Fpr")
+    set(selectorType, ChiSqSelectorType.Fpr.toString)
     set(alpha, value)
   }
 
@@ -132,37 +130,15 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str
         case Row(label: Double, features: Vector) =>
           OldLabeledPoint(label, OldVectors.fromML(features))
       }
-    $(selectorType) match {
+    var model = $(selectorType) match {
       case "KBest" =>
-        chiSqSelector = new feature.ChiSqSelector().setNumTopFeatures($(numTopFeatures))
+        new feature.ChiSqSelector().setNumTopFeatures($(numTopFeatures)).fit(input)
       case "Percentile" =>
-        chiSqSelector = new feature.ChiSqSelector().setPercentile($(percentile))
+        new feature.ChiSqSelector().setPercentile($(percentile)).fit(input)
       case "Fpr" =>
-        chiSqSelector = new feature.ChiSqSelector().setAlpha($(alpha))
-      case _ => throw new Exception("Unknown ChiSqSelector Type.")
+        new feature.ChiSqSelector().setAlpha($(alpha)).fit(input)
+      case _ => throw new IllegalStateException("Unknown ChiSqSelector Type.")
     }
-    val model = chiSqSelector.fit(input)
-    copyValues(new ChiSqSelectorModel(uid, model).setParent(this))
-  }
-
-  @Since("2.1.0")
-  def selectKBest(value: Int): ChiSqSelectorModel = {
-    require(chiSqSelector != null, "ChiSqSelector has not been created.")
-    val model = chiSqSelector.selectKBest(value)
-    copyValues(new ChiSqSelectorModel(uid, model).setParent(this))
-  }
-
-  @Since("2.1.0")
-  def selectPercentile(value: Double): ChiSqSelectorModel = {
-    require(chiSqSelector != null, "ChiSqSelector has not been created.")
-    val model = chiSqSelector.selectPercentile(value)
-    copyValues(new ChiSqSelectorModel(uid, model).setParent(this))
-  }
-
-  @Since("2.1.0")
-  def selectFpr(value: Double): ChiSqSelectorModel = {
-    require(chiSqSelector != null, "ChiSqSelector has not been created.")
-    val model = chiSqSelector.selectFpr(value)
     copyValues(new ChiSqSelectorModel(uid, model).setParent(this))
   }
 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index 1c3b49a04b843..6c0db2cb8ccac 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -27,7 +27,6 @@ import org.apache.spark.annotation.Since
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.stat.Statistics
-import org.apache.spark.mllib.stat.test.ChiSqTestResult
 import org.apache.spark.mllib.util.{Loader, Saveable}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.SparkContext
@@ -184,7 +183,6 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
   private var percentile: Double = 10.0
   private var alpha: Double = 0.05
   private var selectorType = ChiSqSelectorType.KBest
-  private var chiSqTestResult: Array[ChiSqTestResult] = _
 
   @Since("1.3.0")
   def this(numTopFeatures: Int) {
@@ -201,6 +199,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
 
   @Since("2.1.0")
   def setPercentile(value: Double): this.type = {
+    require(value <= 100 && value >= 0, "Percentile should be larger than 0 and less than 100")
     percentile = value
     selectorType = ChiSqSelectorType.Percentile
     this
@@ -228,35 +227,20 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
    */
   @Since("1.3.0")
   def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
-    chiSqTestResult = Statistics.chiSqTest(data)
-    selectorType match {
-      case ChiSqSelectorType.KBest => selectKBest(numTopFeatures)
-      case ChiSqSelectorType.Percentile => selectPercentile(percentile)
-      case ChiSqSelectorType.Fpr => selectFpr(alpha)
-      case _ => throw new Exception("Unknown ChiSqSelector Type")
+    var indices = selectorType match {
+      case ChiSqSelectorType.KBest => Statistics.chiSqTest(data)
+        .zipWithIndex.sortBy { case (res, _) => -res.statistic }
+        .take(numTopFeatures)
+        .map { case (_, indices) => indices }
+      case ChiSqSelectorType.Percentile => Statistics.chiSqTest(data)
+        .zipWithIndex.sortBy { case (res, _) => -res.statistic }
+        .take((data.count() * percentile / 100).toInt)
+        .map { case (_, indices) => indices }
+      case ChiSqSelectorType.Fpr => Statistics.chiSqTest(data)
+        .zipWithIndex.filter{ case (res, _) => res.pValue < alpha }
+        .map { case (_, indices) => indices }
+      case _ => throw new IllegalStateException("Unknown ChiSqSelector Type")
     }
-  }
-
-  @Since("2.1.0")
-  def selectKBest(value: Int): ChiSqSelectorModel = {
-    val indices = chiSqTestResult.zipWithIndex.sortBy { case (res, _) => -res.statistic }
-    .take(numTopFeatures)
-    .map { case (_, indices) => indices }
-    new ChiSqSelectorModel(indices)
-  }
-
-  @Since("2.1.0")
-  def selectPercentile(value: Double): ChiSqSelectorModel = {
-    val indices = chiSqTestResult.zipWithIndex.sortBy { case (res, _) => -res.statistic }
-    .take((chiSqTestResult.length * percentile / 100).toInt)
-    .map { case (_, indices) => indices }
-    new ChiSqSelectorModel(indices)
-  }
-
-  @Since("2.1.0")
-  def selectFpr(value: Double): ChiSqSelectorModel = {
-    val indices = chiSqTestResult.zipWithIndex.filter{ case (res, _) => res.pValue < alpha }
-    .map { case (_, indices) => indices }
     new ChiSqSelectorModel(indices)
   }
 }
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala
index a29ff83ae0cce..50d175a18753c 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala
@@ -60,7 +60,7 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
         assert(vec1 ~== vec2 absTol 1e-1)
     }
 
-    selector.selectPercentile(34).transform(df)
+    selector.setPercentile(34).fit(df).transform(df)
     .select("filtered", "preFilteredData").collect().foreach {
       case Row(vec1: Vector, vec2: Vector) =>
         assert(vec1 ~== vec2 absTol 1e-1)

From 85a17dd8a3e0a8d0d5a041d14a4472b5c202abcc Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Mon, 22 Aug 2016 19:20:38 +0800
Subject: [PATCH 06/21] fix Percentile bugs, optimize the code

---
 .../spark/ml/feature/ChiSqSelector.scala      | 18 +++++++++----
 .../spark/mllib/feature/ChiSqSelector.scala   | 25 ++++++++++---------
 .../spark/ml/feature/ChiSqSelectorSuite.scala |  2 +-
 3 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index 884da6de85450..0d1946136832f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -72,6 +72,13 @@ private[feature] trait ChiSqSelectorParams extends Params
   /** @group getParam */
   def getAlpha: Double = $(alpha)
 
+  /**
+   * The ChiSqSelector supports KBest, Percentile, Fpr selection,
+   * which is the same as ChiSqSelectorType defined in MLLIB.
+   * when call setNumTopFeatures, the selectorType is set to KBest
+   * when call setPercentile, the selectorType is set to Percentile
+   * when call setFpr, the selectorType is set to Fpr
+   */
   final val selectorType = new Param[String](this, "selectorType",
     "ChiSqSelector Type: KBest, Percentile, Fpr")
   setDefault(selectorType -> ChiSqSelectorType.KBest.toString)
@@ -130,14 +137,15 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str
         case Row(label: Double, features: Vector) =>
           OldLabeledPoint(label, OldVectors.fromML(features))
       }
-    var model = $(selectorType) match {
-      case "KBest" =>
+    var model = ChiSqSelectorType.withName($(selectorType)) match {
+      case ChiSqSelectorType.KBest =>
         new feature.ChiSqSelector().setNumTopFeatures($(numTopFeatures)).fit(input)
-      case "Percentile" =>
+      case ChiSqSelectorType.Percentile =>
         new feature.ChiSqSelector().setPercentile($(percentile)).fit(input)
-      case "Fpr" =>
+      case ChiSqSelectorType.Fpr =>
         new feature.ChiSqSelector().setAlpha($(alpha)).fit(input)
-      case _ => throw new IllegalStateException("Unknown ChiSqSelector Type.")
+      case errorType =>
+        throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType")
     }
     copyValues(new ChiSqSelectorModel(uid, model).setParent(this))
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index 6c0db2cb8ccac..74a61dc9ce585 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -33,7 +33,7 @@ import org.apache.spark.SparkContext
 import org.apache.spark.sql.{Row, SparkSession}
 
 @Since("2.1.0")
-object ChiSqSelectorType extends Enumeration {
+private[spark] object ChiSqSelectorType extends Enumeration {
   type SelectorType = Value
   val KBest, Percentile, Fpr = Value
 }
@@ -180,7 +180,7 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] {
 @Since("2.1.0")
 class ChiSqSelector @Since("2.1.0") () extends Serializable {
   private var numTopFeatures: Int = 50
-  private var percentile: Double = 10.0
+  private var percentile: Double = 0.1
   private var alpha: Double = 0.05
   private var selectorType = ChiSqSelectorType.KBest
 
@@ -199,7 +199,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
 
   @Since("2.1.0")
   def setPercentile(value: Double): this.type = {
-    require(value <= 100 && value >= 0, "Percentile should be larger than 0 and less than 100")
+    require(value <= 1 && value >= 0, "Percentile should be larger than 0 and less than 100")
     percentile = value
     selectorType = ChiSqSelectorType.Percentile
     this
@@ -227,20 +227,21 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
    */
   @Since("1.3.0")
   def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
-    var indices = selectorType match {
-      case ChiSqSelectorType.KBest => Statistics.chiSqTest(data)
+    val chiSqTestResult = Statistics.chiSqTest(data)
+    val features = selectorType match {
+      case ChiSqSelectorType.KBest => chiSqTestResult
         .zipWithIndex.sortBy { case (res, _) => -res.statistic }
         .take(numTopFeatures)
-        .map { case (_, indices) => indices }
-      case ChiSqSelectorType.Percentile => Statistics.chiSqTest(data)
+      case ChiSqSelectorType.Percentile => chiSqTestResult
         .zipWithIndex.sortBy { case (res, _) => -res.statistic }
-        .take((data.count() * percentile / 100).toInt)
-        .map { case (_, indices) => indices }
-      case ChiSqSelectorType.Fpr => Statistics.chiSqTest(data)
+        .take((chiSqTestResult.length * percentile).toInt)
+      case ChiSqSelectorType.Fpr => chiSqTestResult
         .zipWithIndex.filter{ case (res, _) => res.pValue < alpha }
-        .map { case (_, indices) => indices }
-      case _ => throw new IllegalStateException("Unknown ChiSqSelector Type")
+      case errorType =>
+        throw new IllegalStateException("Unknown ChiSqSelector Type: $errorType")
     }
+    val indices = features.map { case (_, indices) => indices }
     new ChiSqSelectorModel(indices)
   }
 }
+
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala
index 50d175a18753c..e0293dbc4b0b2 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/ChiSqSelectorSuite.scala
@@ -60,7 +60,7 @@ class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
         assert(vec1 ~== vec2 absTol 1e-1)
     }
 
-    selector.setPercentile(34).fit(df).transform(df)
+    selector.setPercentile(0.34).fit(df).transform(df)
     .select("filtered", "preFilteredData").collect().foreach {
       case Row(vec1: Vector, vec2: Vector) =>
         assert(vec1 ~== vec2 absTol 1e-1)

From 61b71c81a75f88f8f9144e325e1a58f271e1aba0 Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Mon, 22 Aug 2016 19:34:37 +0800
Subject: [PATCH 07/21] change the default value of Percentile

---
 .../scala/org/apache/spark/ml/feature/ChiSqSelector.scala   | 2 +-
 .../org/apache/spark/mllib/feature/ChiSqSelector.scala      | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index 0d1946136832f..b9b88a0ddcf66 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -59,7 +59,7 @@ private[feature] trait ChiSqSelectorParams extends Params
   final val percentile = new DoubleParam(this, "percentile",
     "Percentile of features that selector will select, ordered by statistics value descending.",
     ParamValidators.gtEq(0))
-  setDefault(percentile -> 10)
+  setDefault(percentile -> 0.1)
 
   /** @group getParam */
   def getPercentile: Double = $(percentile)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index 74a61dc9ce585..79692f51887e1 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -76,7 +76,11 @@ class ChiSqSelectorModel @Since("1.3.0") (
    * @param filterIndices indices of features to filter
    */
   private def compress(features: Vector, filterIndices: Array[Int]): Vector = {
-    val orderedIndices = filterIndices.sorted
+    val orderedIndices = if (isSorted(filterIndices)) {
+      filterIndices
+    } else {
+      filterIndices.sorted
+    }
     features match {
       case SparseVector(size, indices, values) =>
         val newSize = orderedIndices.length

From d7b2892129cb64600443992a0041ef1781f74d86 Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Tue, 23 Aug 2016 12:57:34 +0800
Subject: [PATCH 08/21] Add require for setAlpha value

---
 .../scala/org/apache/spark/mllib/feature/ChiSqSelector.scala   | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index 79692f51887e1..0302b2ca86c34 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -203,7 +203,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
 
   @Since("2.1.0")
   def setPercentile(value: Double): this.type = {
-    require(value <= 1 && value >= 0, "Percentile should be larger than 0 and less than 100")
+    require(value <= 1 && value >= 0, "Percentile should be larger than 0 and less than 1")
     percentile = value
     selectorType = ChiSqSelectorType.Percentile
     this
@@ -211,6 +211,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
 
   @Since("2.1.0")
   def setAlpha(value: Double): this.type = {
+    require(value <= 1 && value >= 0, "alpha value should be larger than 0 and less than 1")
     alpha = value
     selectorType = ChiSqSelectorType.Fpr
     this

From 6699396aa7413040e24dddca679f24fe75c05cfb Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Tue, 23 Aug 2016 20:30:01 +0800
Subject: [PATCH 09/21] Remove isSorted function, change gtEq(0) to
 inRange(0,1) for percentile and alpha validation

---
 .../spark/ml/feature/ChiSqSelector.scala      |  4 ++--
 .../spark/mllib/feature/ChiSqSelector.scala   | 22 ++++---------------
 2 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index b9b88a0ddcf66..6affcd4f25453 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -58,7 +58,7 @@ private[feature] trait ChiSqSelectorParams extends Params
 
   final val percentile = new DoubleParam(this, "percentile",
     "Percentile of features that selector will select, ordered by statistics value descending.",
-    ParamValidators.gtEq(0))
+    ParamValidators.inRange(0, 1))
   setDefault(percentile -> 0.1)
 
   /** @group getParam */
@@ -66,7 +66,7 @@ private[feature] trait ChiSqSelectorParams extends Params
 
   final val alpha = new DoubleParam(this, "alpha",
     "The highest p-value for features to be kept.",
-    ParamValidators.gtEq(0))
+    ParamValidators.inRange(0, 1))
   setDefault(alpha -> 0.05)
 
   /** @group getParam */
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index 0302b2ca86c34..a47a896633323 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -47,16 +47,6 @@ private[spark] object ChiSqSelectorType extends Enumeration {
 class ChiSqSelectorModel @Since("1.3.0") (
   @Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer with Saveable {
 
-  protected def isSorted(array: Array[Int]): Boolean = {
-    var i = 1
-    val len = array.length
-    while (i < len) {
-      if (array(i) < array(i-1)) return false
-      i += 1
-    }
-    true
-  }
-
   /**
    * Applies transformation on a vector.
    *
@@ -76,11 +66,7 @@ class ChiSqSelectorModel @Since("1.3.0") (
    * @param filterIndices indices of features to filter
    */
   private def compress(features: Vector, filterIndices: Array[Int]): Vector = {
-    val orderedIndices = if (isSorted(filterIndices)) {
-      filterIndices
-    } else {
-      filterIndices.sorted
-    }
+    val orderedIndices = filterIndices.sorted
     features match {
       case SparseVector(size, indices, values) =>
         val newSize = orderedIndices.length
@@ -203,7 +189,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
 
   @Since("2.1.0")
   def setPercentile(value: Double): this.type = {
-    require(value <= 1 && value >= 0, "Percentile should be larger than 0 and less than 1")
+    require(0.0 <= value && value <= 1.0, "Percentile must be in [0,1]")
     percentile = value
     selectorType = ChiSqSelectorType.Percentile
     this
@@ -211,7 +197,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
 
   @Since("2.1.0")
   def setAlpha(value: Double): this.type = {
-    require(value <= 1 && value >= 0, "alpha value should be larger than 0 and less than 1")
+    require(0.0 <= value && value <= 1.0, "Alpha must be in [0,1]")
     alpha = value
     selectorType = ChiSqSelectorType.Fpr
     this
@@ -243,7 +229,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
       case ChiSqSelectorType.Fpr => chiSqTestResult
         .zipWithIndex.filter{ case (res, _) => res.pValue < alpha }
       case errorType =>
-        throw new IllegalStateException("Unknown ChiSqSelector Type: $errorType")
+        throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType")
     }
     val indices = features.map { case (_, indices) => indices }
     new ChiSqSelectorModel(indices)

From b8986b5cd763b9d44c6672571ad27552ba8bca73 Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Tue, 23 Aug 2016 20:49:58 +0800
Subject: [PATCH 10/21] Optimize fit function of ml ChiSqSelector

---
 .../org/apache/spark/ml/feature/ChiSqSelector.scala    | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index 6affcd4f25453..da5c9c536296e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -137,16 +137,18 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str
         case Row(label: Double, features: Vector) =>
           OldLabeledPoint(label, OldVectors.fromML(features))
       }
-    var model = ChiSqSelectorType.withName($(selectorType)) match {
+    var selector = new feature.ChiSqSelector()
+    ChiSqSelectorType.withName($(selectorType)) match {
       case ChiSqSelectorType.KBest =>
-        new feature.ChiSqSelector().setNumTopFeatures($(numTopFeatures)).fit(input)
+        selector.setNumTopFeatures($(numTopFeatures))
       case ChiSqSelectorType.Percentile =>
-        new feature.ChiSqSelector().setPercentile($(percentile)).fit(input)
+        selector.setPercentile($(percentile))
       case ChiSqSelectorType.Fpr =>
-        new feature.ChiSqSelector().setAlpha($(alpha)).fit(input)
+        selector.setAlpha($(alpha))
       case errorType =>
         throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType")
     }
+    val model = selector.fit(input)
     copyValues(new ChiSqSelectorModel(uid, model).setParent(this))
   }
 

From 5c2e44cba3494623c283d56bd7cfc1c915901b22 Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Wed, 24 Aug 2016 22:28:02 +0800
Subject: [PATCH 11/21] Fpr to FPR, sort all cases in fit

---
 .../org/apache/spark/ml/feature/ChiSqSelector.scala   | 10 +++++-----
 .../apache/spark/mllib/feature/ChiSqSelector.scala    | 11 +++++------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index da5c9c536296e..7992b8b78a8f5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -73,14 +73,14 @@ private[feature] trait ChiSqSelectorParams extends Params
   def getAlpha: Double = $(alpha)
 
   /**
-   * The ChiSqSelector supports KBest, Percentile, Fpr selection,
+   * The ChiSqSelector supports KBest, Percentile, FPR selection,
    * which is the same as ChiSqSelectorType defined in MLLIB.
    * when call setNumTopFeatures, the selectorType is set to KBest
    * when call setPercentile, the selectorType is set to Percentile
-   * when call setFpr, the selectorType is set to Fpr
+   * when call setAlpha, the selectorType is set to FPR
    */
   final val selectorType = new Param[String](this, "selectorType",
-    "ChiSqSelector Type: KBest, Percentile, Fpr")
+    "ChiSqSelector Type: KBest, Percentile, FPR")
   setDefault(selectorType -> ChiSqSelectorType.KBest.toString)
 
   /** @group getParam */
@@ -113,7 +113,7 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str
 
   @Since("2.1.0")
   def setAlpha(value: Double): this.type = {
-    set(selectorType, ChiSqSelectorType.Fpr.toString)
+    set(selectorType, ChiSqSelectorType.FPR.toString)
     set(alpha, value)
   }
 
@@ -143,7 +143,7 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str
         selector.setNumTopFeatures($(numTopFeatures))
       case ChiSqSelectorType.Percentile =>
         selector.setPercentile($(percentile))
-      case ChiSqSelectorType.Fpr =>
+      case ChiSqSelectorType.FPR =>
         selector.setAlpha($(alpha))
       case errorType =>
         throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index a47a896633323..610f5c2dd479b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -35,7 +35,7 @@ import org.apache.spark.sql.{Row, SparkSession}
 @Since("2.1.0")
 private[spark] object ChiSqSelectorType extends Enumeration {
   type SelectorType = Value
-  val KBest, Percentile, Fpr = Value
+  val KBest, Percentile, FPR = Value
 }
 
 /**
@@ -199,7 +199,7 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
   def setAlpha(value: Double): this.type = {
     require(0.0 <= value && value <= 1.0, "Alpha must be in [0,1]")
     alpha = value
-    selectorType = ChiSqSelectorType.Fpr
+    selectorType = ChiSqSelectorType.FPR
     this
   }
 
@@ -219,15 +219,14 @@ class ChiSqSelector @Since("2.1.0") () extends Serializable {
   @Since("1.3.0")
   def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
     val chiSqTestResult = Statistics.chiSqTest(data)
+      .zipWithIndex.sortBy { case (res, _) => -res.statistic }
     val features = selectorType match {
       case ChiSqSelectorType.KBest => chiSqTestResult
-        .zipWithIndex.sortBy { case (res, _) => -res.statistic }
         .take(numTopFeatures)
       case ChiSqSelectorType.Percentile => chiSqTestResult
-        .zipWithIndex.sortBy { case (res, _) => -res.statistic }
         .take((chiSqTestResult.length * percentile).toInt)
-      case ChiSqSelectorType.Fpr => chiSqTestResult
-        .zipWithIndex.filter{ case (res, _) => res.pValue < alpha }
+      case ChiSqSelectorType.FPR => chiSqTestResult
+        .filter{ case (res, _) => res.pValue < alpha }
       case errorType =>
         throw new IllegalStateException(s"Unknown ChiSqSelector Type: $errorType")
     }

From 0d3967af95bd4e303cf6a5e1e826806f0f6ee617 Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Mon, 29 Aug 2016 22:35:35 +0800
Subject: [PATCH 12/21] Add Python API for ChiSqSelector

---
 .../mllib/api/python/PythonMLLibAPI.scala     | 28 ++++++++--
 python/pyspark/mllib/feature.py               | 52 ++++++++++++++++---
 2 files changed, 71 insertions(+), 9 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index a80cca70f4b28..5dfbd55790b10 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -629,13 +629,35 @@ private[python] class PythonMLLibAPI extends Serializable {
   }
 
   /**
-   * Java stub for ChiSqSelector.fit(). This stub returns a
+   * Java stub for ChiSqSelector.fit() when the selection type is KBest. This stub returns a
    * handle to the Java object instead of the content of the Java object.
    * Extra care needs to be taken in the Python code to ensure it gets freed on
    * exit; see the Py4J documentation.
    */
-  def fitChiSqSelector(numTopFeatures: Int, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = {
-    new ChiSqSelector(numTopFeatures).fit(data.rdd)
+  def fitChiSqSelectorKBest(numTopFeatures: Int,
+    data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = {
+    new ChiSqSelector().setNumTopFeatures(numTopFeatures).fit(data.rdd)
+  }
+
+  /**
+   * Java stub for ChiSqSelector.fit() when the selection type is Percentile. This stub returns a
+   * handle to the Java object instead of the content of the Java object.
+   * Extra care needs to be taken in the Python code to ensure it gets freed on
+   * exit; see the Py4J documentation.
+   */
+  def fitChiSqSelectorPercentile(percentile: Double,
+    data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = {
+    new ChiSqSelector().setPercentile(percentile).fit(data.rdd)
+  }
+
+  /**
+   * Java stub for ChiSqSelector.fit() when the selection type is FPR. This stub returns a
+   * handle to the Java object instead of the content of the Java object.
+   * Extra care needs to be taken in the Python code to ensure it gets freed on
+   * exit; see the Py4J documentation.
+   */
+  def fitChiSqSelectorFPR(alpha: Double, data: JavaRDD[LabeledPoint]): ChiSqSelectorModel = {
+    new ChiSqSelector().setAlpha(alpha).fit(data.rdd)
   }
 
   /**
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index c8a6e33f4d9a4..2bdbabb32ad10 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -276,24 +276,64 @@ class ChiSqSelector(object):
     """
     Creates a ChiSquared feature selector.
 
-    :param numTopFeatures: number of features that selector will select.
-
     >>> data = [
     ...     LabeledPoint(0.0, SparseVector(3, {0: 8.0, 1: 7.0})),
     ...     LabeledPoint(1.0, SparseVector(3, {1: 9.0, 2: 6.0})),
     ...     LabeledPoint(1.0, [0.0, 9.0, 8.0]),
     ...     LabeledPoint(2.0, [8.0, 9.0, 5.0])
     ... ]
-    >>> model = ChiSqSelector(1).fit(sc.parallelize(data))
+    >>> model = ChiSqSelector().setNumTopFeatures(1).fit(sc.parallelize(data))
+    >>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0}))
+    SparseVector(1, {0: 6.0})
+    >>> model.transform(DenseVector([8.0, 9.0, 5.0]))
+    DenseVector([5.0])
+    >>> model = ChiSqSelector().setPercentile(0.34).fit(sc.parallelize(data))
     >>> model.transform(SparseVector(3, {1: 9.0, 2: 6.0}))
     SparseVector(1, {0: 6.0})
     >>> model.transform(DenseVector([8.0, 9.0, 5.0]))
     DenseVector([5.0])
+    >>> data = [
+    ...     LabeledPoint(0.0, SparseVector(4, {0: 8.0, 1: 7.0})),
+    ...     LabeledPoint(1.0, SparseVector(4, {1: 9.0, 2: 6.0, 3: 4.0})),
+    ...     LabeledPoint(1.0, [0.0, 9.0, 8.0, 4.0]),
+    ...     LabeledPoint(2.0, [8.0, 9.0, 5.0, 9.0])
+    ... ]
+    >>> model = ChiSqSelector().setAlpha(0.1).fit(sc.parallelize(data))
+    >>> model.transform(DenseVector([1.0,2.0,3.0,4.0]))
+    DenseVector([4.0])
 
     .. versionadded:: 1.4.0
     """
-    def __init__(self, numTopFeatures):
-        self.numTopFeatures = int(numTopFeatures)
+    def __init__(self):
+        self.param = 50
+        self.fitFunc = "fitChiSqSelectorKBest"
+
+    @since('2.1.0')
+    def setNumTopFeatures(self, numTopFeatures):
+        """
+        set numTopFeature for feature selection by number of top features
+        """
+        self.param = int(numTopFeatures)
+        self.fitFunc = "fitChiSqSelectorKBest"
+        return self
+
+    @since('2.1.0')
+    def setPercentile(self, percentile):
+        """
+        set Percentile [0.0, 1.0] for feature selection by percentile
+        """
+        self.param = float(percentile)
+        self.fitFunc = "fitChiSqSelectorPercentile"
+        return self
+
+    @since('2.1.0')
+    def setAlpha(self, alpha):
+        """
+        set Alpha [0.0, 1.0] for feature selection by FPR
+        """
+        self.param = float(alpha)
+        self.fitFunc = "fitChiSqSelectorFPR"
+        return self
 
     @since('1.4.0')
     def fit(self, data):
@@ -305,7 +345,7 @@ def fit(self, data):
                      treated as categorical for each distinct value.
                      Apply feature discretizer before using this function.
         """
-        jmodel = callMLlibFunc("fitChiSqSelector", self.numTopFeatures, data)
+        jmodel = callMLlibFunc(self.fitFunc, self.param, data)
         return ChiSqSelectorModel(jmodel)
 
 

From 1dc6a8ebad693009dd4bca0e579252ec274bce86 Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Mon, 5 Sep 2016 13:55:51 +0800
Subject: [PATCH 13/21] split the ChiSqSelector param to numTopFeatures,
 Percentile, Alpha in Python

---
 python/pyspark/mllib/feature.py | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 2bdbabb32ad10..bb0cc13419d42 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -271,6 +271,11 @@ def transform(self, vector):
         """
         return JavaVectorTransformer.transform(self, vector)
 
+class ChiSqSelectorType:
+    """
+    This class defines the selector types of Chi Square Selector.
+    """
+    KBest, Percentile, FPR = range(3)
 
 class ChiSqSelector(object):
     """
@@ -305,34 +310,34 @@ class ChiSqSelector(object):
     .. versionadded:: 1.4.0
     """
     def __init__(self):
-        self.param = 50
-        self.fitFunc = "fitChiSqSelectorKBest"
+        self.numTopFeatures = 50
+        self.selectorType = ChiSqSelectorType.KBest
 
     @since('2.1.0')
     def setNumTopFeatures(self, numTopFeatures):
         """
         set numTopFeature for feature selection by number of top features
         """
-        self.param = int(numTopFeatures)
-        self.fitFunc = "fitChiSqSelectorKBest"
+        self.numTopFeatures = int(numTopFeatures)
+        self.selectorType = ChiSqSelectorType.KBest
         return self
 
     @since('2.1.0')
     def setPercentile(self, percentile):
         """
-        set Percentile [0.0, 1.0] for feature selection by percentile
+        set percentile [0.0, 1.0] for feature selection by percentile
         """
-        self.param = float(percentile)
-        self.fitFunc = "fitChiSqSelectorPercentile"
+        self.percentile = float(percentile)
+        self.selectorType = ChiSqSelectorType.Percentile
         return self
 
     @since('2.1.0')
     def setAlpha(self, alpha):
         """
-        set Alpha [0.0, 1.0] for feature selection by FPR
+        set alpha [0.0, 1.0] for feature selection by FPR
         """
-        self.param = float(alpha)
-        self.fitFunc = "fitChiSqSelectorFPR"
+        self.alpha = float(alpha)
+        self.selectorType = ChiSqSelectorType.FPR
         return self
 
     @since('1.4.0')
@@ -345,7 +350,12 @@ def fit(self, data):
                      treated as categorical for each distinct value.
                      Apply feature discretizer before using this function.
         """
-        jmodel = callMLlibFunc(self.fitFunc, self.param, data)
+        if self.selectorType == ChiSqSelectorType.KBest:
+            jmodel = callMLlibFunc("fitChiSqSelectorKBest", self.numTopFeatures, data)
+        elif self.selectorType == ChiSqSelectorType.Percentile:
+            jmodel = callMLlibFunc("fitChiSqSelectorPercentile", self.percentile, data)
+        elif self.selectorType == ChiSqSelectorType.FPR:
+            jmodel = callMLlibFunc("fitChiSqSelectorFPR", self.alpha, data)
         return ChiSqSelectorModel(jmodel)
 
 

From 990887181b12c354270ae3127e10fa35e790a784 Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Mon, 5 Sep 2016 23:22:05 +0800
Subject: [PATCH 14/21] Add type check for Python ChiSqSelector

---
 python/pyspark/mllib/feature.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index bb0cc13419d42..1c18d978607f4 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -356,6 +356,8 @@ def fit(self, data):
             jmodel = callMLlibFunc("fitChiSqSelectorPercentile", self.percentile, data)
         elif self.selectorType == ChiSqSelectorType.FPR:
             jmodel = callMLlibFunc("fitChiSqSelectorFPR", self.alpha, data)
+        else:
+            raise TypeError("Chi Square selector only supports: KBest, Percentile, and FPR.")
         return ChiSqSelectorModel(jmodel)
 
 

From bbccac7b5d3812d71498f9a420829f3e67955794 Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Tue, 6 Sep 2016 17:19:09 +0800
Subject: [PATCH 15/21] Change the exception type of value check

---
 python/pyspark/mllib/feature.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 1c18d978607f4..5350f949c3ecf 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -357,7 +357,8 @@ def fit(self, data):
         elif self.selectorType == ChiSqSelectorType.FPR:
             jmodel = callMLlibFunc("fitChiSqSelectorFPR", self.alpha, data)
         else:
-            raise TypeError("Chi Square selector only supports: KBest, Percentile, and FPR.")
+            raise ValueError("ChiSqSelector type supports KBest(0), Percentile(1) and "
+                "FPR(2), the current value is: %s" % self.selectorType)
         return ChiSqSelectorModel(jmodel)
 
 

From c35bcf14a7c1694ec03f4c58fc6f5b8d5ef3e5bb Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Tue, 13 Sep 2016 17:57:46 +0800
Subject: [PATCH 16/21] change python code style

---
 python/pyspark/mllib/feature.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 5350f949c3ecf..26abe4175ce28 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -357,8 +357,8 @@ def fit(self, data):
         elif self.selectorType == ChiSqSelectorType.FPR:
             jmodel = callMLlibFunc("fitChiSqSelectorFPR", self.alpha, data)
         else:
-            raise ValueError("ChiSqSelector type supports KBest(0), Percentile(1) and "
-                "FPR(2), the current value is: %s" % self.selectorType)
+            raise ValueError("ChiSqSelector type supports KBest(0), Percentile(1) and"
+                             " FPR(2), the current value is: %s" % self.selectorType)
         return ChiSqSelectorModel(jmodel)
 
 

From e8f03edbc54d3c9cb32688f79dd129dbf043da38 Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Tue, 13 Sep 2016 18:33:40 +0800
Subject: [PATCH 17/21] change python code style

---
 python/pyspark/mllib/feature.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 26abe4175ce28..dd876fbac9194 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -271,12 +271,14 @@ def transform(self, vector):
         """
         return JavaVectorTransformer.transform(self, vector)
 
+
 class ChiSqSelectorType:
     """
     This class defines the selector types of Chi Square Selector.
     """
     KBest, Percentile, FPR = range(3)
 
+
 class ChiSqSelector(object):
     """
     Creates a ChiSquared feature selector.

From ec74ddebe6bc9f1e85c60e9fcd88355691b5f42c Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Tue, 13 Sep 2016 20:19:55 +0800
Subject: [PATCH 18/21] revert isSort to pass MiMa test

---
 .../spark/mllib/feature/ChiSqSelector.scala    | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index 610f5c2dd479b..16beb5072c26d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -47,6 +47,16 @@ private[spark] object ChiSqSelectorType extends Enumeration {
 class ChiSqSelectorModel @Since("1.3.0") (
   @Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer with Saveable {
 
+  protected def isSorted(array: Array[Int]): Boolean = {
+    var i = 1
+    val len = array.length
+    while (i < len) {
+      if (array(i) < array(i-1)) return false
+      i += 1
+    }
+    true
+  }
+
   /**
    * Applies transformation on a vector.
    *
@@ -169,10 +179,10 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] {
  */
 @Since("2.1.0")
 class ChiSqSelector @Since("2.1.0") () extends Serializable {
-  private var numTopFeatures: Int = 50
-  private var percentile: Double = 0.1
-  private var alpha: Double = 0.05
-  private var selectorType = ChiSqSelectorType.KBest
+  var numTopFeatures: Int = 50
+  var percentile: Double = 0.1
+  var alpha: Double = 0.05
+  var selectorType = ChiSqSelectorType.KBest
 
   @Since("1.3.0")
   def this(numTopFeatures: Int) {

From 6398f4ca954ff0971c5ef3db7956847b09d54849 Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Wed, 14 Sep 2016 15:11:15 +0800
Subject: [PATCH 19/21] Change MimaExcludes

---
 .../org/apache/spark/mllib/feature/ChiSqSelector.scala | 10 ----------
 project/MimaExcludes.scala                             |  3 +++
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index 16beb5072c26d..226b296293133 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -47,16 +47,6 @@ private[spark] object ChiSqSelectorType extends Enumeration {
 class ChiSqSelectorModel @Since("1.3.0") (
   @Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer with Saveable {
 
-  protected def isSorted(array: Array[Int]): Boolean = {
-    var i = 1
-    val len = array.length
-    while (i < len) {
-      if (array(i) < array(i-1)) return false
-      i += 1
-    }
-    true
-  }
-
   /**
    * Applies transformation on a vector.
    *
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 688218f6f43af..d24f2f3f25192 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -787,6 +787,9 @@ object MimaExcludes {
     ) ++ Seq(
         // [SPARK-14743] Improve delegation token handling in secure cluster
         ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.SparkHadoopUtil.getTimeFromNowToRenewal")
+    ) ++ Seq(
+        // [SPARK-17017] Add chiSquare selector based on False Positive Rate (FPR) test
+        ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.mllib.feature.ChiSqSelectorModel.isSorted")
       )
   }
 

From 1d2f67f01be415203173bc441fbb238ccb7121a9 Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Sun, 18 Sep 2016 15:43:24 +0800
Subject: [PATCH 20/21] add javadoc

---
 .../org/apache/spark/ml/feature/ChiSqSelector.scala    |  5 ++++-
 .../org/apache/spark/mllib/feature/ChiSqSelector.scala | 10 ++++++++--
 python/pyspark/mllib/feature.py                        |  4 ++--
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index 7992b8b78a8f5..accaecb9d6dce 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -90,6 +90,9 @@ private[feature] trait ChiSqSelectorParams extends Params
 /**
  * Chi-Squared feature selection, which selects categorical features to use for predicting a
  * categorical label.
+ * The selector supports three selection methods: KBest, Percentile and FPR.
+ * By default, the selection method is KBest, the default number of top features is 50.
+ * User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods.
  */
 @Since("1.6.0")
 final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: String)
@@ -99,7 +102,7 @@ final class ChiSqSelector @Since("1.6.0") (@Since("1.6.0") override val uid: Str
   def this() = this(Identifiable.randomUID("chiSqSelector"))
 
   /** @group setParam */
-  @Since("2.1.0")
+  @Since("1.6.0")
   def setNumTopFeatures(value: Int): this.type = {
     set(selectorType, ChiSqSelectorType.KBest.toString)
     set(numTopFeatures, value)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index 87998af01d39d..079e8ff9d598e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -166,21 +166,27 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] {
 
 /**
  * Creates a ChiSquared feature selector.
+ * The selector supports three selection methods: KBest, Percentile and FPR.
+ * By default, the selection method is KBest, the default number of top features is 50.
+ * User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods.
  */
-@Since("2.1.0")
+@Since("1.3.0")
 class ChiSqSelector @Since("2.1.0") () extends Serializable {
   var numTopFeatures: Int = 50
   var percentile: Double = 0.1
   var alpha: Double = 0.05
   var selectorType = ChiSqSelectorType.KBest
 
+  /**
+   * This is the same as calling this() and then setNumTopFeatures(numTopFeatures).
+   */
   @Since("1.3.0")
   def this(numTopFeatures: Int) {
     this()
     this.numTopFeatures = numTopFeatures
   }
 
-  @Since("2.1.0")
+  @Since("1.6.0")
   def setNumTopFeatures(value: Int): this.type = {
     numTopFeatures = value
     selectorType = ChiSqSelectorType.KBest
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index cc4be6f994ecc..5b489617afb45 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -310,8 +310,8 @@ class ChiSqSelector(object):
 
     .. versionadded:: 1.4.0
     """
-    def __init__(self):
-        self.numTopFeatures = 50
+    def __init__(self, numTopFeatures=50):
+        self.numTopFeatures = numTopFeatures
         self.selectorType = ChiSqSelectorType.KBest
 
     @since('2.1.0')

From 88d2143989a4219020100999c53bb1186fce5d1c Mon Sep 17 00:00:00 2001
From: "Peng, Meng" <peng.meng@intel.com>
Date: Mon, 19 Sep 2016 09:36:05 +0800
Subject: [PATCH 21/21] change javadoc

---
 .../scala/org/apache/spark/ml/feature/ChiSqSelector.scala  | 7 +++++--
 .../org/apache/spark/mllib/feature/ChiSqSelector.scala     | 7 +++++--
 python/pyspark/mllib/feature.py                            | 6 ++++++
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
index accaecb9d6dce..0c6a37bab0aad 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala
@@ -90,8 +90,11 @@ private[feature] trait ChiSqSelectorParams extends Params
 /**
  * Chi-Squared feature selection, which selects categorical features to use for predicting a
  * categorical label.
- * The selector supports three selection methods: KBest, Percentile and FPR.
- * By default, the selection method is KBest, the default number of top features is 50.
+ * The selector supports three selection methods: `KBest`, `Percentile` and `FPR`.
+ * `KBest` chooses the `k` top features according to a chi-squared test.
+ * `Percentile` is similar but chooses a fraction of all features instead of a fixed number.
+ * `FPR` chooses all features whose p-value is below `alpha`, controlling the false positive rate.
+ * By default, the selection method is `KBest`, the default number of top features is 50.
  * User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods.
  */
 @Since("1.6.0")
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index 079e8ff9d598e..f68a017184b21 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -166,8 +166,11 @@ object ChiSqSelectorModel extends Loader[ChiSqSelectorModel] {
 
 /**
  * Creates a ChiSquared feature selector.
- * The selector supports three selection methods: KBest, Percentile and FPR.
- * By default, the selection method is KBest, the default number of top features is 50.
+ * The selector supports three selection methods: `KBest`, `Percentile` and `FPR`.
+ * `KBest` chooses the `k` top features according to a chi-squared test.
+ * `Percentile` is similar but chooses a fraction of all features instead of a fixed number.
+ * `FPR` chooses all features whose p-value is below `alpha`, controlling the false positive rate.
+ * By default, the selection method is `KBest`, the default number of top features is 50.
  * User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods.
  */
 @Since("1.3.0")
diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py
index 87449979f3a8c..077c11370eb3f 100644
--- a/python/pyspark/mllib/feature.py
+++ b/python/pyspark/mllib/feature.py
@@ -281,6 +281,12 @@ class ChiSqSelectorType:
 class ChiSqSelector(object):
     """
     Creates a ChiSquared feature selector.
+    The selector supports three selection methods: `KBest`, `Percentile` and `FPR`.
+    `KBest` chooses the `k` top features according to a chi-squared test.
+    `Percentile` is similar but chooses a fraction of all features instead of a fixed number.
+    `FPR` chooses all features whose p-value is below `alpha`, controlling the false positive rate.
+    By default, the selection method is `KBest`, the default number of top features is 50.
+    User can use setNumTopFeatures, setPercentile and setAlpha to set different selection methods.
 
     >>> data = [
     ...     LabeledPoint(0.0, SparseVector(3, {0: 8.0, 1: 7.0})),