From a2358f7afa8502b8272a4e7caa6c64ad9f0db27d Mon Sep 17 00:00:00 2001 From: Ruben Janssen Date: Sat, 16 Jul 2016 16:03:19 +0100 Subject: [PATCH 1/9] added a python example for chisq selector in mllib --- .../python/mllib/chisq_selector_example.py | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 examples/src/main/python/mllib/chisq_selector_example.py diff --git a/examples/src/main/python/mllib/chisq_selector_example.py b/examples/src/main/python/mllib/chisq_selector_example.py new file mode 100644 index 0000000000000..55b8640eefa9d --- /dev/null +++ b/examples/src/main/python/mllib/chisq_selector_example.py @@ -0,0 +1,54 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function +from pyspark import SparkContext + +import numpy as np + +# $example on$ +from pyspark.mllib.regression import LabeledPoint +from pyspark.mllib.feature import ChiSqSelector +from pyspark.mllib.util import MLUtils +# $example off$ + +if __name__ == "__main__": + sc = SparkContext(appName="ChiSqSelectorExample") + + # $example on$ + # Load and parse the data file into an RDD of LabeledPoint. + data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt') + + # Discretize data in 16 equal bins since ChiSqSelector requires categorical features + def distributeOverBins(lp): + return np.array(map(lambda x: x % 16, lp.features.toArray())) + + # Even though features are doubles, the ChiSqSelector treats each unique value as a category + discretizedData = data.map(lambda lp: LabeledPoint(lp.label, distributeOverBins(lp))) + + # Create ChiSqSelector that will select top 50 of 692 features + selector = ChiSqSelector(numTopFeatures=50) + + # Create ChiSqSelector model (selecting features) + transformer = selector.fit(discretizedData) + + # Filter the top 50 features from each feature vector + filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features)) + # $example off$ + + filteredData.foreach(print) + sc.stop() From ca7cd787e174e04fbe0fcdcff26c8169450abc7b Mon Sep 17 00:00:00 2001 From: Ruben Janssen Date: Mon, 1 Aug 2016 19:14:01 +0100 Subject: [PATCH 2/9] updated documentation to refer to the example --- docs/mllib-feature-extraction.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md index 867be7f2932ed..71a0336275676 100644 --- a/docs/mllib-feature-extraction.md +++ b/docs/mllib-feature-extraction.md @@ -251,7 +251,6 @@ The following example shows the basic use of ChiSqSelector. The data set used ha
- Refer to the [`ChiSqSelector` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) for details on the API. @@ -259,12 +258,18 @@ for details on the API.
- Refer to the [`ChiSqSelector` Java docs](api/java/org/apache/spark/mllib/feature/ChiSqSelector.html) for details on the API. {% include_example java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java %}
+ +
+Refer to the [`ChiSqSelector` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.ChiSqSelector) +for details on the API. + +{% include_example python/mllib/chisq_selector_example.py %} +
## ElementwiseProduct From 035aeb63ef8e8f2af8f7ed838d434a069392c336 Mon Sep 17 00:00:00 2001 From: Ruben Janssen Date: Sun, 16 Oct 2016 16:00:44 +0100 Subject: [PATCH 3/9] updated with changes suggested by sethah --- .../src/main/python/mllib/chisq_selector_example.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/src/main/python/mllib/chisq_selector_example.py b/examples/src/main/python/mllib/chisq_selector_example.py index 55b8640eefa9d..0466dd93c9a26 100644 --- a/examples/src/main/python/mllib/chisq_selector_example.py +++ b/examples/src/main/python/mllib/chisq_selector_example.py @@ -24,6 +24,7 @@ from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.feature import ChiSqSelector from pyspark.mllib.util import MLUtils +from pyspark.mllib.linalg import Vectors # $example off$ if __name__ == "__main__": @@ -35,7 +36,8 @@ # Discretize data in 16 equal bins since ChiSqSelector requires categorical features def distributeOverBins(lp): - return np.array(map(lambda x: x % 16, lp.features.toArray())) + return np.floor(lp.features.toArray() / 16) + # Even though features are doubles, the ChiSqSelector treats each unique value as a category discretizedData = data.map(lambda lp: LabeledPoint(lp.label, distributeOverBins(lp))) @@ -47,8 +49,14 @@ def distributeOverBins(lp): transformer = selector.fit(discretizedData) # Filter the top 50 features from each feature vector - filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features)) + + #filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features)) + filteredData = discretizedData.map(lambda lp: LabeledPoint(lp.label, transformer.transform(np.array([1])))) + + # $example off$ + print('filtered data:') filteredData.foreach(print) + sc.stop() From f49e6aea59994c471ea0270b41d5237a1f2a6a47 Mon Sep 17 00:00:00 2001 From: Ruben Janssen Date: Sun, 16 Oct 2016 16:09:46 +0100 Subject: [PATCH 4/9] oops forgot to revert back local changes --- examples/src/main/python/mllib/chisq_selector_example.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/examples/src/main/python/mllib/chisq_selector_example.py b/examples/src/main/python/mllib/chisq_selector_example.py index 0466dd93c9a26..65e8d80da38bd 100644 --- a/examples/src/main/python/mllib/chisq_selector_example.py +++ b/examples/src/main/python/mllib/chisq_selector_example.py @@ -24,7 +24,6 @@ from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.feature import ChiSqSelector from pyspark.mllib.util import MLUtils -from pyspark.mllib.linalg import Vectors # $example off$ if __name__ == "__main__": @@ -38,7 +37,6 @@ def distributeOverBins(lp): return np.floor(lp.features.toArray() / 16) - # Even though features are doubles, the ChiSqSelector treats each unique value as a category discretizedData = data.map(lambda lp: LabeledPoint(lp.label, distributeOverBins(lp))) @@ -49,11 +47,7 @@ def distributeOverBins(lp): transformer = selector.fit(discretizedData) # Filter the top 50 features from each feature vector - - #filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features)) - filteredData = discretizedData.map(lambda lp: LabeledPoint(lp.label, transformer.transform(np.array([1])))) - - + filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features)) # $example off$ print('filtered data:') From 8363e28e2d400c599052120153fc08eff8253cd5 Mon Sep 17 00:00:00 2001 From: setjet Date: Mon, 3 Apr 2017 20:53:02 +0100 Subject: [PATCH 5/9] increased pyspark version --- python/pyspark/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 08a301695fda7..41bf8c269b795 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.1.0.dev0" +__version__ = "2.2.0.dev0" From 881470d87d499c16cfbf6ea0a265369d60ba8f80 Mon Sep 17 00:00:00 2001 From: setjet Date: Mon, 3 Apr 2017 22:25:37 +0100 Subject: [PATCH 6/9] Revert "oops forgot to revert back local changes" This reverts commit f49e6aea59994c471ea0270b41d5237a1f2a6a47. --- examples/src/main/python/mllib/chisq_selector_example.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/src/main/python/mllib/chisq_selector_example.py b/examples/src/main/python/mllib/chisq_selector_example.py index 65e8d80da38bd..0466dd93c9a26 100644 --- a/examples/src/main/python/mllib/chisq_selector_example.py +++ b/examples/src/main/python/mllib/chisq_selector_example.py @@ -24,6 +24,7 @@ from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.feature import ChiSqSelector from pyspark.mllib.util import MLUtils +from pyspark.mllib.linalg import Vectors # $example off$ if __name__ == "__main__": @@ -37,6 +38,7 @@ def distributeOverBins(lp): return np.floor(lp.features.toArray() / 16) + # Even though features are doubles, the ChiSqSelector treats each unique value as a category discretizedData = data.map(lambda lp: LabeledPoint(lp.label, distributeOverBins(lp))) @@ -47,7 +49,11 @@ def distributeOverBins(lp): transformer = selector.fit(discretizedData) # Filter the top 50 features from each feature vector - filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features)) + + #filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features)) + filteredData = discretizedData.map(lambda lp: LabeledPoint(lp.label, transformer.transform(np.array([1])))) + + # $example off$ print('filtered data:') From 09171936d5d1e9293fee6d28c44d74441a4920ab Mon Sep 17 00:00:00 2001 From: setjet Date: Mon, 3 Apr 2017 22:26:03 +0100 Subject: [PATCH 7/9] Revert "updated with changes suggested by sethah" This reverts commit 035aeb63ef8e8f2af8f7ed838d434a069392c336. --- .../src/main/python/mllib/chisq_selector_example.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/examples/src/main/python/mllib/chisq_selector_example.py b/examples/src/main/python/mllib/chisq_selector_example.py index 0466dd93c9a26..55b8640eefa9d 100644 --- a/examples/src/main/python/mllib/chisq_selector_example.py +++ b/examples/src/main/python/mllib/chisq_selector_example.py @@ -24,7 +24,6 @@ from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.feature import ChiSqSelector from pyspark.mllib.util import MLUtils -from pyspark.mllib.linalg import Vectors # $example off$ if __name__ == "__main__": @@ -36,8 +35,7 @@ # Discretize data in 16 equal bins since ChiSqSelector requires categorical features def distributeOverBins(lp): - return np.floor(lp.features.toArray() / 16) - + return np.array(map(lambda x: x % 16, lp.features.toArray())) # Even though features are doubles, the ChiSqSelector treats each unique value as a category discretizedData = data.map(lambda lp: LabeledPoint(lp.label, distributeOverBins(lp))) @@ -49,14 +47,8 @@ def distributeOverBins(lp): transformer = selector.fit(discretizedData) # Filter the top 50 features from each feature vector - - #filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features)) - filteredData = discretizedData.map(lambda lp: LabeledPoint(lp.label, transformer.transform(np.array([1])))) - - + filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features)) # $example off$ - print('filtered data:') filteredData.foreach(print) - sc.stop() From c15654aa242d486b5eeb7e22e79915a165f6bb99 Mon Sep 17 00:00:00 2001 From: setjet Date: Mon, 3 Apr 2017 22:26:30 +0100 Subject: [PATCH 8/9] Revert "updated documentation to refer to the example" This reverts commit ca7cd787e174e04fbe0fcdcff26c8169450abc7b. --- docs/mllib-feature-extraction.md | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/docs/mllib-feature-extraction.md b/docs/mllib-feature-extraction.md index 503948b3acd24..75aea70601875 100644 --- a/docs/mllib-feature-extraction.md +++ b/docs/mllib-feature-extraction.md @@ -256,6 +256,7 @@ The following example shows the basic use of ChiSqSelector. The data set used ha
+ Refer to the [`ChiSqSelector` Scala docs](api/scala/index.html#org.apache.spark.mllib.feature.ChiSqSelector) for details on the API. @@ -263,18 +264,12 @@ for details on the API.
+ Refer to the [`ChiSqSelector` Java docs](api/java/org/apache/spark/mllib/feature/ChiSqSelector.html) for details on the API. {% include_example java/org/apache/spark/examples/mllib/JavaChiSqSelectorExample.java %}
- -
-Refer to the [`ChiSqSelector` Python docs](api/python/pyspark.mllib.html#pyspark.mllib.feature.ChiSqSelector) -for details on the API. - -{% include_example python/mllib/chisq_selector_example.py %} -
## ElementwiseProduct From 47e4ab2cf8794718d68b5007f4980aae175eb94e Mon Sep 17 00:00:00 2001 From: setjet Date: Mon, 3 Apr 2017 22:26:39 +0100 Subject: [PATCH 9/9] Revert "added a python example for chisq selector in mllib" This reverts commit a2358f7afa8502b8272a4e7caa6c64ad9f0db27d. --- .../python/mllib/chisq_selector_example.py | 54 ------------------- 1 file changed, 54 deletions(-) delete mode 100644 examples/src/main/python/mllib/chisq_selector_example.py diff --git a/examples/src/main/python/mllib/chisq_selector_example.py b/examples/src/main/python/mllib/chisq_selector_example.py deleted file mode 100644 index 55b8640eefa9d..0000000000000 --- a/examples/src/main/python/mllib/chisq_selector_example.py +++ /dev/null @@ -1,54 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from __future__ import print_function -from pyspark import SparkContext - -import numpy as np - -# $example on$ -from pyspark.mllib.regression import LabeledPoint -from pyspark.mllib.feature import ChiSqSelector -from pyspark.mllib.util import MLUtils -# $example off$ - -if __name__ == "__main__": - sc = SparkContext(appName="ChiSqSelectorExample") - - # $example on$ - # Load and parse the data file into an RDD of LabeledPoint. - data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt') - - # Discretize data in 16 equal bins since ChiSqSelector requires categorical features - def distributeOverBins(lp): - return np.array(map(lambda x: x % 16, lp.features.toArray())) - - # Even though features are doubles, the ChiSqSelector treats each unique value as a category - discretizedData = data.map(lambda lp: LabeledPoint(lp.label, distributeOverBins(lp))) - - # Create ChiSqSelector that will select top 50 of 692 features - selector = ChiSqSelector(numTopFeatures=50) - - # Create ChiSqSelector model (selecting features) - transformer = selector.fit(discretizedData) - - # Filter the top 50 features from each feature vector - filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features)) - # $example off$ - - filteredData.foreach(print) - sc.stop()