From c596b3931e4653b4e064a73ebbb58aed52baa995 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Tue, 11 Oct 2016 18:11:57 +0800
Subject: [PATCH 01/19] recreate pr

---
 .../spark/ml/clustering/GaussianMixture.scala | 15 +++++++++++++++
 .../ml/clustering/GaussianMixtureSuite.scala  |  7 +++++++
 2 files changed, 22 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index a7bb41379538d..ba42bc0c50b89 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -129,6 +129,21 @@ class GaussianMixtureModel private[ml] (
     Vectors.dense(probs)
   }
 
+  /**
+   * Return the total log-likelihood for this model on the given data.
+   */
+  @Since("2.1.0")
+  def computeLogLikelihood(dataset: Dataset[_]): Double = {
+    SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT)
+    val sc = dataset.sparkSession.sparkContext
+    val bcweightVec = sc.broadcast(Vectors.dense(weights))
+    transform(dataset).select($(probabilityCol)).map {
+      case Row(probs: Vector) =>
+        val likelihood = BLAS.dot(probs, bcweightVec.value)
+        math.log(likelihood)
+    }.reduce(_ + _)
+  }
+
   /**
    * Retrieve Gaussian distributions as a DataFrame.
    * Each row represents a Gaussian Distribution.
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index a362aeea3962b..83b336e4ce256 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -140,6 +140,13 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     model.setSummary(None)
     assert(!model.hasSummary)
+
+    // Check validity of LogLikelihood
+    val llk = transformed.select(probabilityColName).map(_.getAs[Vector](0)).map { probs =>
+      val likelihood = probs.toArray.zip(model.weights).map { case (a, b) => a * b }.sum
+      math.log(likelihood)
+    }.rdd.sum()
+    assert(model.computeLogLikelihood(dataset) === llk)
   }
 
   test("read/write") {

From 6973301f0e90a1aa3793da1535d3afff128bcda5 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Tue, 11 Oct 2016 21:25:20 +0800
Subject: [PATCH 02/19] update pr

---
 .../spark/ml/clustering/GaussianMixture.scala | 19 ++++++++++----
 .../org/apache/spark/mllib/util/MLUtils.scala |  2 +-
 .../ml/clustering/GaussianMixtureSuite.scala  | 26 +++++++++++++++++++
 3 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index ba42bc0c50b89..78d63e10c989d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -135,11 +135,20 @@ class GaussianMixtureModel private[ml] (
   @Since("2.1.0")
   def computeLogLikelihood(dataset: Dataset[_]): Double = {
     SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT)
-    val sc = dataset.sparkSession.sparkContext
-    val bcweightVec = sc.broadcast(Vectors.dense(weights))
-    transform(dataset).select($(probabilityCol)).map {
-      case Row(probs: Vector) =>
-        val likelihood = BLAS.dot(probs, bcweightVec.value)
+
+    val spark = dataset.sparkSession
+
+    import spark.implicits._
+
+    val bcWeightAndDists =
+      spark.sparkContext.broadcast(weights.zip(gaussians))
+
+    dataset.select(col($(featuresCol))).map {
+      case Row(point: Vector) =>
+        val likelihood = bcWeightAndDists.value.map {
+          case (weight, dist) =>
+            EPSILON + weight * dist.pdf(point)
+        }.sum
         math.log(likelihood)
     }.reduce(_ + _)
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 6bb3271aacb44..112b82a574b40 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -34,7 +34,7 @@ import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.random.BernoulliCellSampler
 
 /**
- * Helper methods to load, save and pre-process data used in ML Lib.
+ * Helper methods to load, save and pre-process data used in MLLib.
  */
 @Since("0.8.0")
 object MLUtils extends Logging {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index 83b336e4ce256..a157a923c35d0 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -149,6 +149,32 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     assert(model.computeLogLikelihood(dataset) === llk)
   }
 
+  test("check LogLikelihood") {
+    val rdd = sc.parallelize(1 to 5).map(i => TestRow(Vectors.dense(i, i + 1)))
+    val dataset = spark.createDataFrame(rdd)
+
+    val gaussian1 = new MultivariateGaussian(Vectors.dense(Array(-1.0, -1.0)),
+      Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)))
+    val gaussian2 = new MultivariateGaussian(Vectors.dense(Array(1.0, 1.0)),
+      Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)))
+    val model = new GaussianMixtureModel("gmm", Array(0.2, 0.8), Array(gaussian1, gaussian2))
+
+    val llk = model.computeLogLikelihood(dataset)
+    assert(llk ~== -52.804472030823533 relTol 1E-4)
+    /*
+      Using the following Python code to compute the log-likelihood:
+
+      import numpy as np
+      from scipy.stats import multivariate_normal
+      data = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
+      pdf1 = multivariate_normal.pdf(data, mean=[-1, -1], cov=np.array([[1, 0], [0, 1]]))
+      pdf2 = multivariate_normal.pdf(data, mean=[1, 1], cov=np.array([[1, 0], [0, 1]]))
+      pdf = pdf1 * 0.2 + pdf2 * 0.8
+      > sum(np.log(pdf))
+      -52.804472030823533
+    */
+  }
+
   test("read/write") {
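Patch 02 reworks computeLogLikelihood to evaluate the mixture density directly, instead of dotting the normalized posterior probabilities from the probability column with the mixture weights: the total log-likelihood of the data is the sum over points x of log( sum over components k of weight_k * pdf_k(x) ), with EPSILON added inside the sum to guard against log(0). A minimal driver-local Scala sketch of that quantity (no Spark job; it reuses the toy data from the "check LogLikelihood" test above, and MultivariateGaussian is the same org.apache.spark.ml.stat.distribution class the patch uses; the sketch omits the EPSILON guard, which only matters when a point is far from every component):

    import org.apache.spark.ml.linalg.{Matrices, Vector, Vectors}
    import org.apache.spark.ml.stat.distribution.MultivariateGaussian

    // Total log-likelihood of a Gaussian mixture:
    //   sum over points x of log( sum over components k of w_k * N(x | mu_k, Sigma_k) )
    def logLikelihood(
        points: Seq[Vector],
        weights: Array[Double],
        gaussians: Array[MultivariateGaussian]): Double = {
      points.map { x =>
        math.log(weights.zip(gaussians).map { case (w, g) => w * g.pdf(x) }.sum)
      }.sum
    }

    // Same toy setup as the test: five points, two unit-covariance components.
    val points = (1 to 5).map(i => Vectors.dense(i.toDouble, i + 1.0))
    val eye = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
    val gaussians = Array(
      new MultivariateGaussian(Vectors.dense(-1.0, -1.0), eye),
      new MultivariateGaussian(Vectors.dense(1.0, 1.0), eye))
    println(logLikelihood(points, Array(0.2, 0.8), gaussians)) // ~ -52.804472030823533

This agrees with the scipy cross-check embedded in the comment of the new test.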
From 9e316b9bd88be30d75e2628134d29784feb8c502 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Wed, 12 Oct 2016 11:01:52 +0800
Subject: [PATCH 03/19] update tol

---
 .../scala/org/apache/spark/ml/clustering/GaussianMixture.scala | 2 --
 .../org/apache/spark/ml/clustering/GaussianMixtureSuite.scala  | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 78d63e10c989d..2106ab0928a12 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -138,8 +138,6 @@ class GaussianMixtureModel private[ml] (
 
     val spark = dataset.sparkSession
 
-    import spark.implicits._
-
     val bcWeightAndDists =
       spark.sparkContext.broadcast(weights.zip(gaussians))
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index a157a923c35d0..2bbe669ce738f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -160,7 +160,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     val model = new GaussianMixtureModel("gmm", Array(0.2, 0.8), Array(gaussian1, gaussian2))
 
     val llk = model.computeLogLikelihood(dataset)
-    assert(llk ~== -52.804472030823533 relTol 1E-4)
+    assert(llk ~== -52.804472030823533 relTol 1E-6)
     /*
       Using the following Python code to compute the log-likelihood:
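Patch 03 drops import spark.implicits._, and patch 04 below restores it: Dataset.map requires an implicit Encoder for its result type, and computeLogLikelihood maps each row to a Double, so the Encoder[Double] supplied by spark.implicits._ must be in scope. A small sketch of that dependency (the column name "features" and the constant body are placeholders for illustration, not the PR's code):

    import org.apache.spark.sql.Dataset
    import org.apache.spark.sql.functions.col

    def perPointScores(dataset: Dataset[_]): Dataset[Double] = {
      val spark = dataset.sparkSession
      import spark.implicits._ // supplies the Encoder[Double] that .map below needs
      // Without the import above, compilation fails with
      // "Unable to find encoder for type stored in a Dataset".
      dataset.select(col("features")).map(_ => 0.0) // placeholder per-row score
    }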
From 05f60b964695eef1e6a1551deeb8466a9a96da2b Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Wed, 12 Oct 2016 16:33:25 +0800
Subject: [PATCH 04/19] add implicits

---
 .../scala/org/apache/spark/ml/clustering/GaussianMixture.scala | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 2106ab0928a12..18e615374704b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -137,6 +137,7 @@ class GaussianMixtureModel private[ml] (
     SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT)
 
     val spark = dataset.sparkSession
+    import spark.implicits._
 
     val bcWeightAndDists =
       spark.sparkContext.broadcast(weights.zip(gaussians))

From 45bb563a6ce8355de59f0e76a86e8f1a6a7dc15d Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Thu, 8 Dec 2016 13:21:56 +0800
Subject: [PATCH 05/19] fix isotonic

---
 .../org/apache/spark/ml/clustering/GaussianMixtureSuite.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index 2bbe669ce738f..5ed7fa3166bb7 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.ml.clustering
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Vector, Vectors}
+import org.apache.spark.ml.linalg._
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.stat.distribution.MultivariateGaussian
 import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}

From c992a1a90ea991a215e7b00e643932e6ee060a04 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Tue, 10 Jan 2017 14:08:29 +0800
Subject: [PATCH 06/19] fix conflict and update version

---
 .../scala/org/apache/spark/ml/clustering/GaussianMixture.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 18e615374704b..127e9a3942ea9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -132,7 +132,7 @@ class GaussianMixtureModel private[ml] (
   /**
    * Return the total log-likelihood for this model on the given data.
    */
-  @Since("2.1.0")
+  @Since("2.2.0")
   def computeLogLikelihood(dataset: Dataset[_]): Double = {
     SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT)
From 2bbdf5b2756dc2c0220d31632f0883e375e3e675 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Tue, 10 Jan 2017 15:27:45 +0800
Subject: [PATCH 07/19] fix a bug in test

---
 .../spark/ml/clustering/GaussianMixtureSuite.scala | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index 5ed7fa3166bb7..815a05e56da70 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -142,11 +142,14 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     assert(!model.hasSummary)
 
     // Check validity of LogLikelihood
-    val llk = transformed.select(probabilityColName).map(_.getAs[Vector](0)).map { probs =>
-      val likelihood = probs.toArray.zip(model.weights).map { case (a, b) => a * b }.sum
+    transformed.select(probabilityColName).rdd.map(_.getAs[Vector](0))
+    val llk = transformed.select(probabilityColName).rdd.map(_.getAs[Vector](0))
+      .map { probs =>
+        val likelihood = probs.toArray.zip(model.weights).map {
+          case (a, b) => a * b }.sum
       math.log(likelihood)
-    }.rdd.sum()
-    assert(model.computeLogLikelihood(dataset) === llk)
+      }.sum()
+    assert(model.computeLogLikelihood(dataset) ~== llk relTol 1E-8)

From d5bb62b3019f9b61ba715158b73f4eb0a19a2f5f Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Tue, 10 Jan 2017 18:23:24 +0800
Subject: [PATCH 08/19] fix test

---
 .../apache/spark/ml/clustering/GaussianMixture.scala | 10 +++-------
 .../spark/ml/clustering/GaussianMixtureSuite.scala   | 10 ----------
 2 files changed, 3 insertions(+), 17 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 127e9a3942ea9..f2928753fd579 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -135,18 +135,14 @@ class GaussianMixtureModel private[ml] (
   @Since("2.2.0")
   def computeLogLikelihood(dataset: Dataset[_]): Double = {
     SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT)
-
     val spark = dataset.sparkSession
     import spark.implicits._
 
-    val bcWeightAndDists =
-      spark.sparkContext.broadcast(weights.zip(gaussians))
-
+    val bcWeightAndDists = spark.sparkContext.broadcast(weights.zip(gaussians))
     dataset.select(col($(featuresCol))).map {
-      case Row(point: Vector) =>
+      case Row(feature: Vector) =>
         val likelihood = bcWeightAndDists.value.map {
-          case (weight, dist) =>
-            EPSILON + weight * dist.pdf(point)
+          case (weight, dist) => EPSILON + weight * dist.pdf(feature)
         }.sum
         math.log(likelihood)
     }.reduce(_ + _)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index 815a05e56da70..47274f5aa78c4 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -140,16 +140,6 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
 
     model.setSummary(None)
     assert(!model.hasSummary)
-
-    // Check validity of LogLikelihood
-    transformed.select(probabilityColName).rdd.map(_.getAs[Vector](0))
-    val llk = transformed.select(probabilityColName).rdd.map(_.getAs[Vector](0))
-      .map { probs =>
-        val likelihood = probs.toArray.zip(model.weights).map {
-          case (a, b) => a * b }.sum
-        math.log(likelihood)
-      }.sum()
-    assert(model.computeLogLikelihood(dataset) ~== llk relTol 1E-8)
   }
 
   test("check LogLikelihood") {
From dd4ee788301cd37b287378a631df6324e2546df0 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Thu, 12 Jan 2017 11:14:03 +0800
Subject: [PATCH 09/19] update test

---
 .../ml/clustering/GaussianMixtureSuite.scala | 35 +++++--------------
 1 file changed, 8 insertions(+), 27 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index 47274f5aa78c4..9149dbf3f7994 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.ml.clustering
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.ml.linalg._
+import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Vector, Vectors}
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.stat.distribution.MultivariateGaussian
 import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
@@ -142,32 +142,6 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     assert(!model.hasSummary)
   }
 
-  test("check LogLikelihood") {
-    val rdd = sc.parallelize(1 to 5).map(i => TestRow(Vectors.dense(i, i + 1)))
-    val dataset = spark.createDataFrame(rdd)
-
-    val gaussian1 = new MultivariateGaussian(Vectors.dense(Array(-1.0, -1.0)),
-      Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)))
-    val gaussian2 = new MultivariateGaussian(Vectors.dense(Array(1.0, 1.0)),
-      Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)))
-    val model = new GaussianMixtureModel("gmm", Array(0.2, 0.8), Array(gaussian1, gaussian2))
-
-    val llk = model.computeLogLikelihood(dataset)
-    assert(llk ~== -52.804472030823533 relTol 1E-6)
-    /*
-      Using the following Python code to compute the log-likelihood:
-
-      import numpy as np
-      from scipy.stats import multivariate_normal
-      data = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
-      pdf1 = multivariate_normal.pdf(data, mean=[-1, -1], cov=np.array([[1, 0], [0, 1]]))
-      pdf2 = multivariate_normal.pdf(data, mean=[1, 1], cov=np.array([[1, 0], [0, 1]]))
-      pdf = pdf1 * 0.2 + pdf2 * 0.8
-      > sum(np.log(pdf))
-      -52.804472030823533
-    */
-  }
-
   test("read/write") {
     def checkModelData(model: GaussianMixtureModel, model2: GaussianMixtureModel): Unit = {
       assert(model.weights === model2.weights)
@@ -233,6 +207,10 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
            [,1]      [,2]
      [1,] 0.2961543 0.160783
      [2,] 0.1607830 1.008878
+
+     model$loglik
+
+     [1] -46.89499
      */
     val weights = Array(0.5333333, 0.4666667)
     val means = Array(Vectors.dense(10.363673, 9.897081), Vectors.dense(0.11731091, -0.06192351))
@@ -245,6 +223,9 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     val expected = new GaussianMixtureModel("dummy", weights, gaussians)
     val actual = new GaussianMixture().setK(2).setSeed(seed).fit(rDataset)
     modelEquals(expected, actual)
+
+    val llk = expected.computeLogLikelihood(rDataset)
+    assert(llk ~== -46.89499 absTol 1E-6)
   }
 
   test("upper triangular matrix unpacking") {
From 8d8bb242698f3342133cbc473dbf4a9b64281cf7 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Thu, 12 Jan 2017 11:41:50 +0800
Subject: [PATCH 10/19] update tol

---
 .../org/apache/spark/ml/clustering/GaussianMixtureSuite.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index 9149dbf3f7994..a25bea5505c6b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -225,7 +225,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     modelEquals(expected, actual)
 
     val llk = expected.computeLogLikelihood(rDataset)
-    assert(llk ~== -46.89499 absTol 1E-6)
+    assert(llk ~== -46.89499 absTol 1E-5)
   }
 
   test("upper triangular matrix unpacking") {

From fe2c424a4aa08f5f50387069db26f179e50395d4 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Mon, 16 Jan 2017 13:01:19 +0800
Subject: [PATCH 11/19] add llk in summary

---
 .../spark/ml/clustering/GaussianMixture.scala | 26 ++++---------------
 .../ml/clustering/GaussianMixtureSuite.scala  |  2 +-
 python/pyspark/ml/clustering.py               |  8 ++++++
 3 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index f2928753fd579..db5fff5af86ef 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -129,25 +129,6 @@ class GaussianMixtureModel private[ml] (
     Vectors.dense(probs)
   }
 
-  /**
-   * Return the total log-likelihood for this model on the given data.
-   */
-  @Since("2.2.0")
-  def computeLogLikelihood(dataset: Dataset[_]): Double = {
-    SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT)
-    val spark = dataset.sparkSession
-    import spark.implicits._
-
-    val bcWeightAndDists = spark.sparkContext.broadcast(weights.zip(gaussians))
-    dataset.select(col($(featuresCol))).map {
-      case Row(feature: Vector) =>
-        val likelihood = bcWeightAndDists.value.map {
-          case (weight, dist) => EPSILON + weight * dist.pdf(feature)
-        }.sum
-        math.log(likelihood)
-    }.reduce(_ + _)
-  }
-
   /**
    * Retrieve Gaussian distributions as a DataFrame.
    * Each row represents a Gaussian Distribution.
@@ -435,7 +416,7 @@ class GaussianMixture @Since("2.0.0") (
     val model = copyValues(new GaussianMixtureModel(uid, weights, gaussianDists)).setParent(this)
     val summary = new GaussianMixtureSummary(model.transform(dataset),
-      $(predictionCol), $(probabilityCol), $(featuresCol), $(k))
+      $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood)
     model.setSummary(Some(summary))
     instr.logSuccess(model)
     model
@@ -693,6 +674,7 @@ private class ExpectationAggregator(
  * in `predictions`.
  * @param featuresCol Name for column of features in `predictions`.
  * @param k Number of clusters.
+ * @param logLikelihood Total log-likelihood for this model on the given data.
  */
 @Since("2.0.0")
 @Experimental
@@ -701,7 +683,9 @@ class GaussianMixtureSummary private[clustering] (
     predictionCol: String,
     @Since("2.0.0") val probabilityCol: String,
     featuresCol: String,
-    k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) {
+    k: Int,
+    @Since("2.2.0") val logLikelihood: Double)
+  extends ClusteringSummary(predictions, predictionCol, featuresCol, k) {
 
   /**
    * Probability of each cluster.
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index a25bea5505c6b..e54eb2750c389 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -224,7 +224,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     val actual = new GaussianMixture().setK(2).setSeed(seed).fit(rDataset)
     modelEquals(expected, actual)
 
-    val llk = expected.computeLogLikelihood(rDataset)
+    val llk = actual.summary.logLikelihood
     assert(llk ~== -46.89499 absTol 1E-5)
 
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 25f97f5696a14..8e68c68852c5d 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -281,6 +281,14 @@ def probability(self):
         """
         return self._call_java("probability")
 
+    @property
+    @since("2.2.0")
+    def logLikelihood(self):
+        """
+        Total log-likelihood for this model on the given data.
+        """
+        return self._call_java("logLikelihood")
+
 
 class KMeansSummary(ClusteringSummary):
     """
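With patch 11 the log-likelihood is no longer a public method on the model: it is computed once during fitting and exposed as GaussianMixtureSummary.logLikelihood on both the Scala and Python sides. A usage sketch against that API shape (the toy data, k, and seed are illustrative only, not taken from the PR):

    import org.apache.spark.ml.clustering.GaussianMixture
    import org.apache.spark.ml.linalg.Vectors
    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().master("local[2]").appName("gmm-llk").getOrCreate()
    import spark.implicits._

    // Two well-separated blobs; "features" is the default features column name.
    val dataset = Seq(
      Vectors.dense(-5.0, -4.9), Vectors.dense(-5.1, -5.2), Vectors.dense(-4.8, -5.1),
      Vectors.dense(5.0, 5.1), Vectors.dense(4.9, 5.3), Vectors.dense(5.2, 4.8)
    ).map(Tuple1.apply).toDF("features")

    val model = new GaussianMixture().setK(2).setSeed(1L).fit(dataset)
    println(model.summary.logLikelihood) // total log-likelihood on the training data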
From 68f72fa84afbe9381bebbb620c0ceda7e48dd36e Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Mon, 16 Jan 2017 16:11:10 +0800
Subject: [PATCH 12/19] add mima excludes

---
 project/MimaExcludes.scala | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 2314d7f45cb21..ba667d2bb5d60 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -44,6 +44,9 @@ object MimaExcludes {
 
     // [SPARK-18537] Add a REST api to spark streaming
     ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.streaming.scheduler.StreamingListener.onStreamingStarted")
+
+    // [SPARK-14272][ML] Evaluate GaussianMixtureModel with LogLikelihood
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.GaussianMixtureSummary.this")
   )
 
   // Exclude rules for 2.1.x
From d3336429dfb6564f93ddaa615771882fc72eed58 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Mon, 16 Jan 2017 16:17:17 +0800
Subject: [PATCH 13/19] fix mima

---
 project/MimaExcludes.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index ba667d2bb5d60..67b00af436c54 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -43,7 +43,7 @@ object MimaExcludes {
     ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.recoverPartitions"),
 
     // [SPARK-18537] Add a REST api to spark streaming
-    ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.streaming.scheduler.StreamingListener.onStreamingStarted")
+    ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.streaming.scheduler.StreamingListener.onStreamingStarted"),
 
     // [SPARK-14272][ML] Evaluate GaussianMixtureModel with LogLikelihood
     ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.GaussianMixtureSummary.this")

From d6fa8fa83ebfb3a77185da1c63991a778841e84b Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Tue, 17 Jan 2017 17:13:22 +0800
Subject: [PATCH 14/19] add test

---
 python/pyspark/ml/clustering.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 8e68c68852c5d..7ed226a9df34c 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -175,6 +175,7 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     3
     >>> summary.clusterSizes
     [2, 2, 2]
+    >>> summary.logLikelihood
     >>> weights = model.weights
     >>> len(weights)
     3

From fd85c5d221a0cc52c8b5f4662182d487e34db63b Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Wed, 18 Jan 2017 09:37:14 +0800
Subject: [PATCH 15/19] add output of summary

---
 python/pyspark/ml/clustering.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 7ed226a9df34c..19a1a2815880a 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -176,6 +176,7 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     >>> summary.clusterSizes
     [2, 2, 2]
     >>> summary.logLikelihood
+    8.1463602464817928
     >>> weights = model.weights
     >>> len(weights)
     3

From eebae43c84a1179260648f7f5cdbb63a60fcc40d Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Wed, 18 Jan 2017 14:31:47 +0800
Subject: [PATCH 16/19] update doc test

---
 python/pyspark/ml/clustering.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 19a1a2815880a..79c7da0b63131 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -176,7 +176,7 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     >>> summary.clusterSizes
     [2, 2, 2]
     >>> summary.logLikelihood
-    8.1463602464817928
+    8.146360246481793
     >>> weights = model.weights
     >>> len(weights)
     3

From eb27bcc9c571cda08303a65b65c2d3d947e900ac Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Wed, 18 Jan 2017 18:48:43 +0800
Subject: [PATCH 17/19] update doc test

---
 python/pyspark/ml/clustering.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 79c7da0b63131..c6c1a0033190e 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -176,7 +176,7 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     >>> summary.clusterSizes
     [2, 2, 2]
     >>> summary.logLikelihood
-    8.146360246481793
+    8.14636...
     >>> weights = model.weights
     >>> len(weights)
     3
From e3bc962cb81a11bdcf4c910f6cab319769d177ca Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Thu, 19 Jan 2017 15:23:18 +0800
Subject: [PATCH 18/19] update mima

---
 project/MimaExcludes.scala | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index e2934f75b0fcf..c79c19ab49923 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -46,7 +46,10 @@ object MimaExcludes {
     ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.streaming.scheduler.StreamingListener.onStreamingStarted"),
 
     // [SPARK-19148][SQL] do not expose the external table concept in Catalog
-    ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.createTable")
+    ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.createTable"),
+
+    // [SPARK-14272][ML] Evaluate GaussianMixtureModel with LogLikelihood.
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.GaussianMixtureSummary.this")
   )
 
   // Exclude rules for 2.1.x
@@ -933,9 +936,6 @@ object MimaExcludes {
     ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.setFeatureSubsetStrategy"),
     ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees"),
     ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.setFeatureSubsetStrategy")
-  ) ++ Seq(
-    // [SPARK-14272][ML] Evaluate GaussianMixtureModel with LogLikelihood.
-    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.GaussianMixtureSummary.this")
   )
 }

From cbec946583536283bf31dd5fb4f61b724e502e68 Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Thu, 19 Jan 2017 15:26:19 +0800
Subject: [PATCH 19/19] update pr desc

---
 project/MimaExcludes.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index c79c19ab49923..bf628210a16e6 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -48,7 +48,7 @@ object MimaExcludes {
     // [SPARK-19148][SQL] do not expose the external table concept in Catalog
     ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.createTable"),
 
-    // [SPARK-14272][ML] Evaluate GaussianMixtureModel with LogLikelihood.
+    // [SPARK-14272][ML] Add logLikelihood in GaussianMixtureSummary
     ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.GaussianMixtureSummary.this")
   )
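Continuing the usage sketch after patch 11 (same spark, dataset, and model), the summary value can be cross-checked against the weighted-pdf formula that patch 02 introduced, now evaluated at the fitted parameters. The two need only agree approximately: the summary reports the EM training objective at convergence, not a fresh evaluation under the final parameter update.

    import org.apache.spark.ml.linalg.Vector

    // Recompute sum_i log( sum_k w_k * pdf_k(x_i) ) from the fitted model.
    val recomputed = dataset.select("features").collect()
      .map(_.getAs[Vector](0))
      .map { x =>
        math.log(model.weights.zip(model.gaussians).map { case (w, g) => w * g.pdf(x) }.sum)
      }.sum
    println(s"summary: ${model.summary.logLikelihood}, recomputed: $recomputed")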