From 819098cd43c8a0fc7bae6a9b3686d45f7aafebe4 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Tue, 30 Jun 2015 01:14:18 +0530 Subject: [PATCH 1/5] [SPARK-8711] [ML] Add additional methods to PySpark ML tree models --- python/pyspark/ml/classification.py | 18 +++++++++--- python/pyspark/ml/regression.py | 43 +++++++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 7 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 7abbde8b260eb..53572c67a384e 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -18,7 +18,8 @@ from pyspark.ml.util import keyword_only from pyspark.ml.wrapper import JavaEstimator, JavaModel from pyspark.ml.param.shared import * -from pyspark.ml.regression import RandomForestParams +from pyspark.ml.regression import ( + RandomForestParams, DecisionTreeModel, treeEnsembleModels) from pyspark.mllib.common import inherit_doc @@ -202,6 +203,10 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred >>> td = si_model.transform(df) >>> dt = DecisionTreeClassifier(maxDepth=2, labelCol="indexed") >>> model = dt.fit(td) + >>> model.numNodes + 3 + >>> model.depth + 1 >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -269,7 +274,8 @@ def getImpurity(self): return self.getOrDefault(self.impurity) -class DecisionTreeClassificationModel(JavaModel): +@inherit_doc +class DecisionTreeClassificationModel(DecisionTreeModel): """ Model fitted by DecisionTreeClassifier. 
""" @@ -294,6 +300,8 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred >>> td = si_model.transform(df) >>> rf = RandomForestClassifier(numTrees=2, maxDepth=2, labelCol="indexed", seed=42) >>> model = rf.fit(td) + >>> model.treeWeights + [1.0, 1.0] >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -423,7 +431,7 @@ def getFeatureSubsetStrategy(self): return self.getOrDefault(self.featureSubsetStrategy) -class RandomForestClassificationModel(JavaModel): +class RandomForestClassificationModel(treeEnsembleModels): """ Model fitted by RandomForestClassifier. """ @@ -448,6 +456,8 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol >>> td = si_model.transform(df) >>> gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed") >>> model = gbt.fit(td) + >>> model.treeWeights + [1.0, 0.1, 0.1, 0.1, 0.1] >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -558,7 +568,7 @@ def getStepSize(self): return self.getOrDefault(self.stepSize) -class GBTClassificationModel(JavaModel): +class GBTClassificationModel(treeEnsembleModels): """ Model fitted by GBTClassifier. """ diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index b139e27372d80..ffa43459eea01 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -172,6 +172,10 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi ... 
(0.0, Vectors.sparse(1, [], []))], ["label", "features"]) >>> dt = DecisionTreeRegressor(maxDepth=2) >>> model = dt.fit(df) + >>> model.depth + 2 + >>> model.numNodes + 1 >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -239,7 +243,36 @@ def getImpurity(self): return self.getOrDefault(self.impurity) -class DecisionTreeRegressionModel(JavaModel): +@inherit_doc +class DecisionTreeModel(JavaModel): + + @property + def numNodes(self): + """Return number of nodes of the decision tree.""" + return self._call_java("numNodes") + + @property + def depth(self): + """Return depth of the decision tree.""" + return self._call_java("depth") + + def __repr__(self): + return self._call_java("toString") + + +@inherit_doc +class treeEnsembleModels(JavaModel): + + @property + def treeWeights(self): + return list(self._call_java("treeWeights")) + + def __repr__(self): + return self._call_java("toString") + + +@inherit_doc +class DecisionTreeRegressionModel(DecisionTreeModel): """ Model fitted by DecisionTreeRegressor. """ @@ -259,6 +292,8 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) >>> rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42) >>> model = rf.fit(df) + >>> model.treeWeights + [1.0, 1.0] >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -389,7 +424,7 @@ def getFeatureSubsetStrategy(self): return self.getOrDefault(self.featureSubsetStrategy) -class RandomForestRegressionModel(JavaModel): +class RandomForestRegressionModel(treeEnsembleModels): """ Model fitted by RandomForestRegressor. """ @@ -409,6 +444,8 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, ... 
(0.0, Vectors.sparse(1, [], []))], ["label", "features"]) >>> gbt = GBTRegressor(maxIter=5, maxDepth=2) >>> model = gbt.fit(df) + >>> model.treeWeights + [1.0, 0.1, 0.1, 0.1, 0.1] >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -518,7 +555,7 @@ def getStepSize(self): return self.getOrDefault(self.stepSize) -class GBTRegressionModel(JavaModel): +class GBTRegressionModel(treeEnsembleModels): """ Model fitted by GBTRegressor. """ From 47d702399b35580977d2e47a3a344f06059c860e Mon Sep 17 00:00:00 2001 From: MechCoder Date: Tue, 30 Jun 2015 10:27:55 +0530 Subject: [PATCH 2/5] Use np.allclose and treeEnsembleModels -> TreeEnsembleModels --- python/pyspark/ml/classification.py | 16 +++++++++------- python/pyspark/ml/regression.py | 20 +++++++++++--------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 53572c67a384e..89117e492846b 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -19,7 +19,7 @@ from pyspark.ml.wrapper import JavaEstimator, JavaModel from pyspark.ml.param.shared import * from pyspark.ml.regression import ( - RandomForestParams, DecisionTreeModel, treeEnsembleModels) + RandomForestParams, DecisionTreeModel, TreeEnsembleModels) from pyspark.mllib.common import inherit_doc @@ -290,6 +290,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred It supports both binary and multiclass labels, as well as both continuous and categorical features. 
+ >>> from numpy import allclose >>> from pyspark.mllib.linalg import Vectors >>> from pyspark.ml.feature import StringIndexer >>> df = sqlContext.createDataFrame([ @@ -300,8 +301,8 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred >>> td = si_model.transform(df) >>> rf = RandomForestClassifier(numTrees=2, maxDepth=2, labelCol="indexed", seed=42) >>> model = rf.fit(td) - >>> model.treeWeights - [1.0, 1.0] + >>> allclose(model.treeWeights, [1.0, 1.0]) + True >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -431,7 +432,7 @@ def getFeatureSubsetStrategy(self): return self.getOrDefault(self.featureSubsetStrategy) -class RandomForestClassificationModel(treeEnsembleModels): +class RandomForestClassificationModel(TreeEnsembleModels): """ Model fitted by RandomForestClassifier. """ @@ -446,6 +447,7 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol It supports binary labels, as well as both continuous and categorical features. Note: Multiclass labels are not currently supported. + >>> from numpy import allclose >>> from pyspark.mllib.linalg import Vectors >>> from pyspark.ml.feature import StringIndexer >>> df = sqlContext.createDataFrame([ @@ -456,8 +458,8 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol >>> td = si_model.transform(df) >>> gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed") >>> model = gbt.fit(td) - >>> model.treeWeights - [1.0, 0.1, 0.1, 0.1, 0.1] + >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) + True >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -568,7 +570,7 @@ def getStepSize(self): return self.getOrDefault(self.stepSize) -class GBTClassificationModel(treeEnsembleModels): +class GBTClassificationModel(TreeEnsembleModels): """ Model fitted by GBTClassifier. 
""" diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index ffa43459eea01..2142b2a7bd966 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -173,9 +173,9 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi >>> dt = DecisionTreeRegressor(maxDepth=2) >>> model = dt.fit(df) >>> model.depth - 2 - >>> model.numNodes 1 + >>> model.numNodes + 3 >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -261,7 +261,7 @@ def __repr__(self): @inherit_doc -class treeEnsembleModels(JavaModel): +class TreeEnsembleModels(JavaModel): @property def treeWeights(self): @@ -286,14 +286,15 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi learning algorithm for regression. It supports both continuous and categorical features. + >>> from numpy import allclose >>> from pyspark.mllib.linalg import Vectors >>> df = sqlContext.createDataFrame([ ... (1.0, Vectors.dense(1.0)), ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) >>> rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42) >>> model = rf.fit(df) - >>> model.treeWeights - [1.0, 1.0] + >>> allclose(model.treeWeights, [1.0, 1.0]) + True >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -424,7 +425,7 @@ def getFeatureSubsetStrategy(self): return self.getOrDefault(self.featureSubsetStrategy) -class RandomForestRegressionModel(treeEnsembleModels): +class RandomForestRegressionModel(TreeEnsembleModels): """ Model fitted by RandomForestRegressor. """ @@ -438,14 +439,15 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, learning algorithm for regression. It supports both continuous and categorical features. 
+ >>> from numpy import allclose >>> from pyspark.mllib.linalg import Vectors >>> df = sqlContext.createDataFrame([ ... (1.0, Vectors.dense(1.0)), ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) >>> gbt = GBTRegressor(maxIter=5, maxDepth=2) >>> model = gbt.fit(df) - >>> model.treeWeights - [1.0, 0.1, 0.1, 0.1, 0.1] + >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) + True >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -555,7 +557,7 @@ def getStepSize(self): return self.getOrDefault(self.stepSize) -class GBTRegressionModel(treeEnsembleModels): +class GBTRegressionModel(TreeEnsembleModels): """ Model fitted by GBTRegressor. """ From 6d16ad8b38a9a47de82d261fcbb3d6a0dec621c1 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Wed, 1 Jul 2015 00:15:05 +0530 Subject: [PATCH 3/5] Fix Python 3 Error --- .../src/main/scala/org/apache/spark/ml/tree/treeModels.scala | 5 +++++ python/pyspark/ml/regression.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala index 1929f9d02156e..4c643d9c6ce4b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala @@ -17,6 +17,7 @@ package org.apache.spark.ml.tree +import org.apache.spark.mllib.linalg.{Vectors, Vector} /** * Abstraction for Decision Tree models. @@ -70,6 +71,10 @@ private[ml] trait TreeEnsembleModel { /** Weights for each tree, zippable with [[trees]] */ def treeWeights: Array[Double] + /** Weights used by the python wrappers. */ + // Note: An array cannot be returned directly due to serialization problems. 
+ def pyTreeWeights: Vector = Vectors.dense(treeWeights) + /** Summary of the model */ override def toString: String = { // Implementing classes should generally override this method to be more descriptive. diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 2142b2a7bd966..40017fc39c08f 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -265,7 +265,8 @@ class TreeEnsembleModels(JavaModel): @property def treeWeights(self): - return list(self._call_java("treeWeights")) + """Return the weights for each tree""" + return list(self._call_java("pyTreeWeights")) def __repr__(self): return self._call_java("toString") From 38a08606cb82578f989d60846f79033e02e4edb7 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Tue, 7 Jul 2015 01:11:01 +0530 Subject: [PATCH 4/5] rename pyTreeWeights to javaTreeWeights --- mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala | 2 +- python/pyspark/ml/regression.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala index 4c643d9c6ce4b..0f9709e6638d5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala @@ -73,7 +73,7 @@ private[ml] trait TreeEnsembleModel { /** Weights used by the python wrappers. */ // Note: An array cannot be returned directly due to serialization problems. 
- def pyTreeWeights: Vector = Vectors.dense(treeWeights) + def javaTreeWeights: Vector = Vectors.dense(treeWeights) /** Summary of the model */ override def toString: String = { diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 40017fc39c08f..44f60a769566d 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -266,7 +266,7 @@ class TreeEnsembleModels(JavaModel): @property def treeWeights(self): """Return the weights for each tree""" - return list(self._call_java("pyTreeWeights")) + return list(self._call_java("javaTreeWeights")) def __repr__(self): return self._call_java("toString") From 23b08be2b912fa40ca5162935bd22dfae7a3e967 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Tue, 7 Jul 2015 10:18:58 +0530 Subject: [PATCH 5/5] private [spark] --- mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala index 0f9709e6638d5..22873909c33fa 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala @@ -73,7 +73,7 @@ private[ml] trait TreeEnsembleModel { /** Weights used by the python wrappers. */ // Note: An array cannot be returned directly due to serialization problems. - def javaTreeWeights: Vector = Vectors.dense(treeWeights) + private[spark] def javaTreeWeights: Vector = Vectors.dense(treeWeights) /** Summary of the model */ override def toString: String = {