From a2358f7afa8502b8272a4e7caa6c64ad9f0db27d Mon Sep 17 00:00:00 2001
From: Ruben Janssen <rubenljanssen@gmail.com>
Date: Sat, 16 Jul 2016 16:03:19 +0100
Subject: [PATCH 1/9] added a python example for chisq selector in mllib

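---
Review notes (placed below the separator, so not part of the commit message):

* On Python 3 `map` returns an iterator, so `np.array(map(...))` in
  distributeOverBins builds a 0-d object array rather than a numeric
  feature vector. The expression only works as intended on Python 2.
* `x % 16` wraps each value into 0..15; it does not group values into
  16 equal-width bins as the code comment states. PATCH 3/9 switches to
  `np.floor(lp.features.toArray() / 16)`.
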
 .../python/mllib/chisq_selector_example.py    | 54 +++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 examples/src/main/python/mllib/chisq_selector_example.py
diff --git a/examples/src/main/python/mllib/chisq_selector_example.py b/examples/src/main/python/mllib/chisq_selector_example.py
new file mode 100644
index 0000000000000..55b8640eefa9d
--- /dev/null
+++ b/examples/src/main/python/mllib/chisq_selector_example.py
@@ -0,0 +1,54 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+from pyspark import SparkContext
+
+import numpy as np
+
+# $example on$
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.feature import ChiSqSelector
+from pyspark.mllib.util import MLUtils
+# $example off$
+
+if __name__ == "__main__":
+    sc = SparkContext(appName="ChiSqSelectorExample")
+
+    # $example on$
+    # Load and parse the data file into an RDD of LabeledPoint.
+    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
+
+    # Discretize data in 16 equal bins since ChiSqSelector requires categorical features
+    def distributeOverBins(lp):
+        return np.array(map(lambda x: x % 16, lp.features.toArray()))
+
+    # Even though features are doubles, the ChiSqSelector treats each unique value as a category
+    discretizedData = data.map(lambda lp: LabeledPoint(lp.label, distributeOverBins(lp)))
+
+    # Create ChiSqSelector that will select top 50 of 692 features
+    selector = ChiSqSelector(numTopFeatures=50)
+
+    # Create ChiSqSelector model (selecting features)
+    transformer = selector.fit(discretizedData)
+
+    # Filter the top 50 features from each feature vector
+    filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features))
+    # $example off$
+
+    filteredData.foreach(print)
+    sc.stop()

From ca7cd787e174e04fbe0fcdcff26c8169450abc7b Mon Sep 17 00:00:00 2001
From: Ruben Janssen <rubenljanssen@gmail.com>
Date: Mon, 1 Aug 2016 19:14:01 +0100
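Subject: [PATCH 2/9] updated documentation to refer to the example

Add a Python tab to the ChiSqSelector section of
docs/mllib-feature-extraction.md. The tab links to the Python API docs
and pulls in python/mllib/chisq_selector_example.py via include_example.
Also drop the blank line that followed the opening tag of the Scala and
Java tabs.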
---
 docs/mllib-feature-extraction.md | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md
index 867be7f2932ed..71a0336275676 100644
--- a/docs/mllib-feature-extraction.md
+++ b/docs/mllib-feature-extraction.md
@@ -251,7 +251,6 @@ The following example shows the basic use of ChiSqSelector. The data set used ha
 
 <div class="codetabs">
 <div data-lang="scala" markdown="1">
-
 Refer to the [`ChiSqSelector` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector)
 for details on the API.
 
@@ -259,12 +258,18 @@ for details on the API.
 </div>
 
 <div data-lang="java" markdown="1">
-
 Refer to the [`ChiSqSelector` Java docs](api/java/org/apache/spark/mllib/feature/ChiSqSelector.html)
 for details on the API.
 
 {% include_example java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java %}
 </div>
+
+<div data-lang="python" markdown="1">
+Refer to the [`ChiSqSelector` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.ChiSqSelector)
+for details on the API.
+
+{% include_example python/mllib/chisq_selector_example.py %}
+</div>
 </div>
 
 ## ElementwiseProduct

From 035aeb63ef8e8f2af8f7ed838d434a069392c336 Mon Sep 17 00:00:00 2001
From: Ruben Janssen <rubenljanssen@gmail.com>
Date: Sun, 16 Oct 2016 16:00:44 +0100
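Subject: [PATCH 3/9] updated with changes suggested by sethah

Discretize features with np.floor(features / 16) in place of the
element-wise `x % 16`, and print a 'filtered data:' header before the
results.

This commit also carries local debugging edits (an unused Vectors
import, extra blank lines, and a transform of the constant vector
np.array([1]) replacing the real transform call); PATCH 4/9 removes
them.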
---
 .../src/main/python/mllib/chisq_selector_example.py  | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/examples/src/main/python/mllib/chisq_selector_example.py b/examples/src/main/python/mllib/chisq_selector_example.py
index 55b8640eefa9d..0466dd93c9a26 100644
--- a/examples/src/main/python/mllib/chisq_selector_example.py
+++ b/examples/src/main/python/mllib/chisq_selector_example.py
@@ -24,6 +24,7 @@
 from pyspark.mllib.regression import LabeledPoint
 from pyspark.mllib.feature import ChiSqSelector
 from pyspark.mllib.util import MLUtils
+from pyspark.mllib.linalg import Vectors
 # $example off$
 
 if __name__ == "__main__":
@@ -35,7 +36,8 @@
 
     # Discretize data in 16 equal bins since ChiSqSelector requires categorical features
     def distributeOverBins(lp):
-        return np.array(map(lambda x: x % 16, lp.features.toArray()))
+        return np.floor(lp.features.toArray() / 16)
+
 
     # Even though features are doubles, the ChiSqSelector treats each unique value as a category
     discretizedData = data.map(lambda lp: LabeledPoint(lp.label, distributeOverBins(lp)))
@@ -47,8 +49,14 @@ def distributeOverBins(lp):
     transformer = selector.fit(discretizedData)
 
     # Filter the top 50 features from each feature vector
-    filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features))
+
+    #filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features))
+    filteredData = discretizedData.map(lambda lp: LabeledPoint(lp.label, transformer.transform(np.array([1]))))
+
+
     # $example off$
 
+    print('filtered data:')
     filteredData.foreach(print)
+
     sc.stop()

From f49e6aea59994c471ea0270b41d5237a1f2a6a47 Mon Sep 17 00:00:00 2001
From: Ruben Janssen <rubenljanssen@gmail.com>
Date: Sun, 16 Oct 2016 16:09:46 +0100
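Subject: [PATCH 4/9] oops forgot to revert back local changes

Remove the local debugging edits committed by mistake in PATCH 3/9: the
unused Vectors import, the extra blank lines, and the transform of the
constant vector np.array([1]). The original transformer.transform(...)
call on the discretized features is restored; the np.floor binning and
the 'filtered data:' header stay.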
---
 examples/src/main/python/mllib/chisq_selector_example.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/examples/src/main/python/mllib/chisq_selector_example.py b/examples/src/main/python/mllib/chisq_selector_example.py
index 0466dd93c9a26..65e8d80da38bd 100644
--- a/examples/src/main/python/mllib/chisq_selector_example.py
+++ b/examples/src/main/python/mllib/chisq_selector_example.py
@@ -24,7 +24,6 @@
 from pyspark.mllib.regression import LabeledPoint
 from pyspark.mllib.feature import ChiSqSelector
 from pyspark.mllib.util import MLUtils
-from pyspark.mllib.linalg import Vectors
 # $example off$
 
 if __name__ == "__main__":
@@ -38,7 +37,6 @@
     def distributeOverBins(lp):
         return np.floor(lp.features.toArray() / 16)
 
-
     # Even though features are doubles, the ChiSqSelector treats each unique value as a category
     discretizedData = data.map(lambda lp: LabeledPoint(lp.label, distributeOverBins(lp)))
 
@@ -49,11 +47,7 @@ def distributeOverBins(lp):
     transformer = selector.fit(discretizedData)
 
     # Filter the top 50 features from each feature vector
-
-    #filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features))
-    filteredData = discretizedData.map(lambda lp: LabeledPoint(lp.label, transformer.transform(np.array([1]))))
-
-
+    filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features))
     # $example off$
 
     print('filtered data:')

From 8363e28e2d400c599052120153fc08eff8253cd5 Mon Sep 17 00:00:00 2001
From: setjet <rubenljanssen@gmail.com>
Date: Mon, 3 Apr 2017 20:53:02 +0100
Subject: [PATCH 5/9] Bump PySpark __version__ from 2.1.0.dev0 to 2.2.0.dev0

---
 python/pyspark/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/version.py b/python/pyspark/version.py
index 08a301695fda7..41bf8c269b795 100644
--- a/python/pyspark/version.py
+++ b/python/pyspark/version.py
@@ -16,4 +16,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "2.1.0.dev0"
+__version__ = "2.2.0.dev0"

From 881470d87d499c16cfbf6ea0a265369d60ba8f80 Mon Sep 17 00:00:00 2001
From: setjet <rubenljanssen@gmail.com>
Date: Mon, 3 Apr 2017 22:25:37 +0100
Subject: [PATCH 6/9] Revert "oops forgot to revert back local changes"

This reverts commit f49e6aea59994c471ea0270b41d5237a1f2a6a47.
---
 examples/src/main/python/mllib/chisq_selector_example.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/examples/src/main/python/mllib/chisq_selector_example.py b/examples/src/main/python/mllib/chisq_selector_example.py
index 65e8d80da38bd..0466dd93c9a26 100644
--- a/examples/src/main/python/mllib/chisq_selector_example.py
+++ b/examples/src/main/python/mllib/chisq_selector_example.py
@@ -24,6 +24,7 @@
 from pyspark.mllib.regression import LabeledPoint
 from pyspark.mllib.feature import ChiSqSelector
 from pyspark.mllib.util import MLUtils
+from pyspark.mllib.linalg import Vectors
 # $example off$
 
 if __name__ == "__main__":
@@ -37,6 +38,7 @@
     def distributeOverBins(lp):
         return np.floor(lp.features.toArray() / 16)
 
+
     # Even though features are doubles, the ChiSqSelector treats each unique value as a category
     discretizedData = data.map(lambda lp: LabeledPoint(lp.label, distributeOverBins(lp)))
 
@@ -47,7 +49,11 @@ def distributeOverBins(lp):
     transformer = selector.fit(discretizedData)
 
     # Filter the top 50 features from each feature vector
-    filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features))
+
+    #filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features))
+    filteredData = discretizedData.map(lambda lp: LabeledPoint(lp.label, transformer.transform(np.array([1]))))
+
+
     # $example off$
 
     print('filtered data:')

From 09171936d5d1e9293fee6d28c44d74441a4920ab Mon Sep 17 00:00:00 2001
From: setjet <rubenljanssen@gmail.com>
Date: Mon, 3 Apr 2017 22:26:03 +0100
Subject: [PATCH 7/9] Revert "updated with changes suggested by sethah"

This reverts commit 035aeb63ef8e8f2af8f7ed838d434a069392c336.
---
 .../src/main/python/mllib/chisq_selector_example.py  | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/examples/src/main/python/mllib/chisq_selector_example.py b/examples/src/main/python/mllib/chisq_selector_example.py
index 0466dd93c9a26..55b8640eefa9d 100644
--- a/examples/src/main/python/mllib/chisq_selector_example.py
+++ b/examples/src/main/python/mllib/chisq_selector_example.py
@@ -24,7 +24,6 @@
 from pyspark.mllib.regression import LabeledPoint
 from pyspark.mllib.feature import ChiSqSelector
 from pyspark.mllib.util import MLUtils
-from pyspark.mllib.linalg import Vectors
 # $example off$
 
 if __name__ == "__main__":
@@ -36,8 +35,7 @@
 
     # Discretize data in 16 equal bins since ChiSqSelector requires categorical features
     def distributeOverBins(lp):
-        return np.floor(lp.features.toArray() / 16)
-
+        return np.array(map(lambda x: x % 16, lp.features.toArray()))
 
     # Even though features are doubles, the ChiSqSelector treats each unique value as a category
     discretizedData = data.map(lambda lp: LabeledPoint(lp.label, distributeOverBins(lp)))
@@ -49,14 +47,8 @@ def distributeOverBins(lp):
     transformer = selector.fit(discretizedData)
 
     # Filter the top 50 features from each feature vector
-
-    #filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features))
-    filteredData = discretizedData.map(lambda lp: LabeledPoint(lp.label, transformer.transform(np.array([1]))))
-
-
+    filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features))
     # $example off$
 
-    print('filtered data:')
     filteredData.foreach(print)
-
     sc.stop()

From c15654aa242d486b5eeb7e22e79915a165f6bb99 Mon Sep 17 00:00:00 2001
From: setjet <rubenljanssen@gmail.com>
Date: Mon, 3 Apr 2017 22:26:30 +0100
Subject: [PATCH 8/9] Revert "updated documentation to refer to the example"

This reverts commit ca7cd787e174e04fbe0fcdcff26c8169450abc7b.
---
 docs/mllib-feature-extraction.md | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md
index 503948b3acd24..75aea70601875 100644
--- a/docs/mllib-feature-extraction.md
+++ b/docs/mllib-feature-extraction.md
@@ -256,6 +256,7 @@ The following example shows the basic use of ChiSqSelector. The data set used ha
 
 <div class="codetabs">
 <div data-lang="scala" markdown="1">
+
 Refer to the [`ChiSqSelector` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector)
 for details on the API.
 
@@ -263,18 +264,12 @@ for details on the API.
 </div>
 
 <div data-lang="java" markdown="1">
+
 Refer to the [`ChiSqSelector` Java docs](api/java/org/apache/spark/mllib/feature/ChiSqSelector.html)
 for details on the API.
 
 {% include_example java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java %}
 </div>
-
-<div data-lang="python" markdown="1">
-Refer to the [`ChiSqSelector` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.ChiSqSelector)
-for details on the API.
-
-{% include_example python/mllib/chisq_selector_example.py %}
-</div>
 </div>
 
 ## ElementwiseProduct

From 47e4ab2cf8794718d68b5007f4980aae175eb94e Mon Sep 17 00:00:00 2001
From: setjet <rubenljanssen@gmail.com>
Date: Mon, 3 Apr 2017 22:26:39 +0100
Subject: [PATCH 9/9] Revert "added a python example for chisq selector in
 mllib"

This reverts commit a2358f7afa8502b8272a4e7caa6c64ad9f0db27d.
---
 .../python/mllib/chisq_selector_example.py    | 54 -------------------
 1 file changed, 54 deletions(-)
 delete mode 100644 examples/src/main/python/mllib/chisq_selector_example.py

diff --git a/examples/src/main/python/mllib/chisq_selector_example.py b/examples/src/main/python/mllib/chisq_selector_example.py
deleted file mode 100644
index 55b8640eefa9d..0000000000000
--- a/examples/src/main/python/mllib/chisq_selector_example.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from __future__ import print_function
-from pyspark import SparkContext
-
-import numpy as np
-
-# $example on$
-from pyspark.mllib.regression import LabeledPoint
-from pyspark.mllib.feature import ChiSqSelector
-from pyspark.mllib.util import MLUtils
-# $example off$
-
-if __name__ == "__main__":
-    sc = SparkContext(appName="ChiSqSelectorExample")
-
-    # $example on$
-    # Load and parse the data file into an RDD of LabeledPoint.
-    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
-
-    # Discretize data in 16 equal bins since ChiSqSelector requires categorical features
-    def distributeOverBins(lp):
-        return np.array(map(lambda x: x % 16, lp.features.toArray()))
-
-    # Even though features are doubles, the ChiSqSelector treats each unique value as a category
-    discretizedData = data.map(lambda lp: LabeledPoint(lp.label, distributeOverBins(lp)))
-
-    # Create ChiSqSelector that will select top 50 of 692 features
-    selector = ChiSqSelector(numTopFeatures=50)
-
-    # Create ChiSqSelector model (selecting features)
-    transformer = selector.fit(discretizedData)
-
-    # Filter the top 50 features from each feature vector
-    filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features))
-    # $example off$
-
-    filteredData.foreach(print)
-    sc.stop()