From c596b3931e4653b4e064a73ebbb58aed52baa995 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Tue, 11 Oct 2016 18:11:57 +0800
Subject: [PATCH 01/19] recreate pr

---
 .../spark/ml/clustering/GaussianMixture.scala | 15 +++++++++++++++
 .../ml/clustering/GaussianMixtureSuite.scala  |  7 +++++++
 2 files changed, 22 insertions(+)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index a7bb41379538d..ba42bc0c50b89 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -129,6 +129,21 @@ class GaussianMixtureModel private[ml] (
     Vectors.dense(probs)
   }
 
+  /**
+   * Return the total log-likelihood for this model on the given data.
+   */
+  @Since("2.1.0")
+  def computeLogLikelihood(dataset: Dataset[_]): Double = {
+    SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT)
+    val sc = dataset.sparkSession.sparkContext
+    val bcweightVec = sc.broadcast(Vectors.dense(weights))
+    transform(dataset).select($(probabilityCol)).map {
+      case Row(probs: Vector) =>
+        val likelihood = BLAS.dot(probs, bcweightVec.value)
+        math.log(likelihood)
+    }.reduce(_ + _)
+  }
+
   /**
    * Retrieve Gaussian distributions as a DataFrame.
    * Each row represents a Gaussian Distribution.
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index a362aeea3962b..83b336e4ce256 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -140,6 +140,13 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     model.setSummary(None)
     assert(!model.hasSummary)
+
+    // Check validity of LogLikelihood
+    val llk = transformed.select(probabilityColName).map(_.getAs[Vector](0)).map { probs =>
+      val likelihood = probs.toArray.zip(model.weights).map { case (a, b) => a * b }.sum
+      math.log(likelihood)
+    }.rdd.sum()
+    assert(model.computeLogLikelihood(dataset) === llk)
   }
 
   test("read/write") {

From 6973301f0e90a1aa3793da1535d3afff128bcda5 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Tue, 11 Oct 2016 21:25:20 +0800
Subject: [PATCH 02/19] update pr

---
 .../spark/ml/clustering/GaussianMixture.scala | 19 ++++++++++----
 .../org/apache/spark/mllib/util/MLUtils.scala |  2 +-
 .../ml/clustering/GaussianMixtureSuite.scala  | 26 +++++++++++++++++++
 3 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index ba42bc0c50b89..78d63e10c989d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -135,11 +135,20 @@ class GaussianMixtureModel private[ml] (
   @Since("2.1.0")
   def computeLogLikelihood(dataset: Dataset[_]): Double = {
     SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT)
-    val sc = dataset.sparkSession.sparkContext
-    val bcweightVec = sc.broadcast(Vectors.dense(weights))
-    transform(dataset).select($(probabilityCol)).map {
-      case Row(probs: Vector) =>
-        val likelihood = BLAS.dot(probs, bcweightVec.value)
+
+    val spark = dataset.sparkSession
+
+    import spark.implicits._
+
+    val bcWeightAndDists =
+      spark.sparkContext.broadcast(weights.zip(gaussians))
+
+    dataset.select(col($(featuresCol))).map {
+      case Row(point: Vector) =>
+        val likelihood = bcWeightAndDists.value.map {
+          case (weight, dist) =>
+            EPSILON + weight * dist.pdf(point)
+        }.sum
         math.log(likelihood)
     }.reduce(_ + _)
   }
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 6bb3271aacb44..112b82a574b40 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -34,7 +34,7 @@ import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.random.BernoulliCellSampler
 
 /**
- * Helper methods to load, save and pre-process data used in ML Lib.
+ * Helper methods to load, save and pre-process data used in MLLib.
  */
 @Since("0.8.0")
 object MLUtils extends Logging {
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index 83b336e4ce256..a157a923c35d0 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -149,6 +149,32 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     assert(model.computeLogLikelihood(dataset) === llk)
   }
 
+  test("check LogLikelihood") {
+    val rdd = sc.parallelize(1 to 5).map(i => TestRow(Vectors.dense(i, i + 1)))
+    val dataset = spark.createDataFrame(rdd)
+
+    val gaussian1 = new MultivariateGaussian(Vectors.dense(Array(-1.0, -1.0)),
+      Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)))
+    val gaussian2 = new MultivariateGaussian(Vectors.dense(Array(1.0, 1.0)),
+      Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)))
+    val model = new GaussianMixtureModel("gmm", Array(0.2, 0.8), Array(gaussian1, gaussian2))
+
+    val llk = model.computeLogLikelihood(dataset)
+    assert(llk ~== -52.804472030823533 relTol 1E-4)
+    /*
+      Using the following Python code to compute the log-likelihood:
+
+      import numpy as np
+      from scipy.stats import multivariate_normal
+      data = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
+      pdf1 = multivariate_normal.pdf(data, mean=[-1, -1], cov=np.array([[1, 0], [0, 1]]))
+      pdf2 = multivariate_normal.pdf(data, mean=[1, 1], cov=np.array([[1, 0], [0, 1]]))
+      pdf = pdf1 * 0.2 + pdf2 * 0.8
+      > sum(np.log(pdf))
+      -52.804472030823533
+    */
+  }
+
   test("read/write") {
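Patch 02 reworks computeLogLikelihood to evaluate the mixture density directly, instead of dotting the normalized posterior probabilities from the probability column with the mixture weights: the total log-likelihood of the data is the sum over points x of log( sum over components k of weight_k * pdf_k(x) ), with EPSILON added inside the sum to guard against log(0). A minimal driver-local Scala sketch of that quantity (no Spark job; it reuses the toy data from the "check LogLikelihood" test above, and MultivariateGaussian is the same org.apache.spark.ml.stat.distribution class the patch uses; the sketch omits the EPSILON guard, which only matters when a point is far from every component):

    import org.apache.spark.ml.linalg.{Matrices, Vector, Vectors}
    import org.apache.spark.ml.stat.distribution.MultivariateGaussian

    // Total log-likelihood of a Gaussian mixture:
    //   sum over points x of log( sum over components k of w_k * N(x | mu_k, Sigma_k) )
    def logLikelihood(
        points: Seq[Vector],
        weights: Array[Double],
        gaussians: Array[MultivariateGaussian]): Double = {
      points.map { x =>
        math.log(weights.zip(gaussians).map { case (w, g) => w * g.pdf(x) }.sum)
      }.sum
    }

    // Same toy setup as the test: five points, two unit-covariance components.
    val points = (1 to 5).map(i => Vectors.dense(i.toDouble, i + 1.0))
    val eye = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
    val gaussians = Array(
      new MultivariateGaussian(Vectors.dense(-1.0, -1.0), eye),
      new MultivariateGaussian(Vectors.dense(1.0, 1.0), eye))
    println(logLikelihood(points, Array(0.2, 0.8), gaussians)) // ~ -52.804472030823533

This agrees with the scipy cross-check embedded in the comment of the new test.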
From 9e316b9bd88be30d75e2628134d29784feb8c502 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Wed, 12 Oct 2016 11:01:52 +0800
Subject: [PATCH 03/19] update tol

---
 .../scala/org/apache/spark/ml/clustering/GaussianMixture.scala | 2 --
 .../org/apache/spark/ml/clustering/GaussianMixtureSuite.scala  | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 78d63e10c989d..2106ab0928a12 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -138,8 +138,6 @@ class GaussianMixtureModel private[ml] (
 
     val spark = dataset.sparkSession
 
-    import spark.implicits._
-
     val bcWeightAndDists =
       spark.sparkContext.broadcast(weights.zip(gaussians))
 
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index a157a923c35d0..2bbe669ce738f 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -160,7 +160,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     val model = new GaussianMixtureModel("gmm", Array(0.2, 0.8), Array(gaussian1, gaussian2))
 
     val llk = model.computeLogLikelihood(dataset)
-    assert(llk ~== -52.804472030823533 relTol 1E-4)
+    assert(llk ~== -52.804472030823533 relTol 1E-6)
     /*
       Using the following Python code to compute the log-likelihood:
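Patch 03 drops import spark.implicits._, and patch 04 below restores it: Dataset.map requires an implicit Encoder for its result type, and computeLogLikelihood maps each row to a Double, so the Encoder[Double] supplied by spark.implicits._ must be in scope. A small sketch of that dependency (the column name "features" and the constant body are placeholders for illustration, not the PR's code):

    import org.apache.spark.sql.Dataset
    import org.apache.spark.sql.functions.col

    def perPointScores(dataset: Dataset[_]): Dataset[Double] = {
      val spark = dataset.sparkSession
      import spark.implicits._ // supplies the Encoder[Double] that .map below needs
      // Without the import above, compilation fails with
      // "Unable to find encoder for type stored in a Dataset".
      dataset.select(col("features")).map(_ => 0.0) // placeholder per-row score
    }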
From 05f60b964695eef1e6a1551deeb8466a9a96da2b Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Wed, 12 Oct 2016 16:33:25 +0800
Subject: [PATCH 04/19] add implicits

---
 .../scala/org/apache/spark/ml/clustering/GaussianMixture.scala | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 2106ab0928a12..18e615374704b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -137,6 +137,7 @@ class GaussianMixtureModel private[ml] (
     SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT)
 
     val spark = dataset.sparkSession
+    import spark.implicits._
 
     val bcWeightAndDists =
       spark.sparkContext.broadcast(weights.zip(gaussians))

From 45bb563a6ce8355de59f0e76a86e8f1a6a7dc15d Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Thu, 8 Dec 2016 13:21:56 +0800
Subject: [PATCH 05/19] fix isotonic

---
 .../org/apache/spark/ml/clustering/GaussianMixtureSuite.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index 2bbe669ce738f..5ed7fa3166bb7 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.ml.clustering
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Vector, Vectors}
+import org.apache.spark.ml.linalg._
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.stat.distribution.MultivariateGaussian
 import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}

From c992a1a90ea991a215e7b00e643932e6ee060a04 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Tue, 10 Jan 2017 14:08:29 +0800
Subject: [PATCH 06/19] fix conflict and update version

---
 .../scala/org/apache/spark/ml/clustering/GaussianMixture.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 18e615374704b..127e9a3942ea9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -132,7 +132,7 @@ class GaussianMixtureModel private[ml] (
   /**
    * Return the total log-likelihood for this model on the given data.
    */
-  @Since("2.1.0")
+  @Since("2.2.0")
   def computeLogLikelihood(dataset: Dataset[_]): Double = {
     SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT)
From 2bbdf5b2756dc2c0220d31632f0883e375e3e675 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Tue, 10 Jan 2017 15:27:45 +0800
Subject: [PATCH 07/19] fix a bug in test

---
 .../spark/ml/clustering/GaussianMixtureSuite.scala | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index 5ed7fa3166bb7..815a05e56da70 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -142,11 +142,14 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     assert(!model.hasSummary)
 
     // Check validity of LogLikelihood
-    val llk = transformed.select(probabilityColName).map(_.getAs[Vector](0)).map { probs =>
-      val likelihood = probs.toArray.zip(model.weights).map { case (a, b) => a * b }.sum
+    transformed.select(probabilityColName).rdd.map(_.getAs[Vector](0))
+    val llk = transformed.select(probabilityColName).rdd.map(_.getAs[Vector](0))
+      .map { probs =>
+        val likelihood = probs.toArray.zip(model.weights).map {
+          case (a, b) => a * b }.sum
       math.log(likelihood)
-    }.rdd.sum()
-    assert(model.computeLogLikelihood(dataset) === llk)
+      }.sum()
+    assert(model.computeLogLikelihood(dataset) ~== llk relTol 1E-8)

From d5bb62b3019f9b61ba715158b73f4eb0a19a2f5f Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Tue, 10 Jan 2017 18:23:24 +0800
Subject: [PATCH 08/19] fix test

---
 .../apache/spark/ml/clustering/GaussianMixture.scala | 10 +++-------
 .../spark/ml/clustering/GaussianMixtureSuite.scala   | 10 ----------
 2 files changed, 3 insertions(+), 17 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 127e9a3942ea9..f2928753fd579 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -135,18 +135,14 @@ class GaussianMixtureModel private[ml] (
   @Since("2.2.0")
   def computeLogLikelihood(dataset: Dataset[_]): Double = {
     SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT)
-
     val spark = dataset.sparkSession
     import spark.implicits._
 
-    val bcWeightAndDists =
-      spark.sparkContext.broadcast(weights.zip(gaussians))
-
+    val bcWeightAndDists = spark.sparkContext.broadcast(weights.zip(gaussians))
     dataset.select(col($(featuresCol))).map {
-      case Row(point: Vector) =>
+      case Row(feature: Vector) =>
         val likelihood = bcWeightAndDists.value.map {
-          case (weight, dist) =>
-            EPSILON + weight * dist.pdf(point)
+          case (weight, dist) => EPSILON + weight * dist.pdf(feature)
         }.sum
         math.log(likelihood)
     }.reduce(_ + _)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index 815a05e56da70..47274f5aa78c4 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -140,16 +140,6 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
 
     model.setSummary(None)
     assert(!model.hasSummary)
-
-    // Check validity of LogLikelihood
-    transformed.select(probabilityColName).rdd.map(_.getAs[Vector](0))
-    val llk = transformed.select(probabilityColName).rdd.map(_.getAs[Vector](0))
-      .map { probs =>
-        val likelihood = probs.toArray.zip(model.weights).map {
-          case (a, b) => a * b }.sum
-        math.log(likelihood)
-      }.sum()
-    assert(model.computeLogLikelihood(dataset) ~== llk relTol 1E-8)
   }
 
   test("check LogLikelihood") {
From dd4ee788301cd37b287378a631df6324e2546df0 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Thu, 12 Jan 2017 11:14:03 +0800
Subject: [PATCH 09/19] update test

---
 .../ml/clustering/GaussianMixtureSuite.scala | 35 +++++--------------
 1 file changed, 8 insertions(+), 27 deletions(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index 47274f5aa78c4..9149dbf3f7994 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.ml.clustering
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.ml.linalg._
+import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Vector, Vectors}
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.stat.distribution.MultivariateGaussian
 import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
@@ -142,32 +142,6 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     assert(!model.hasSummary)
   }
 
-  test("check LogLikelihood") {
-    val rdd = sc.parallelize(1 to 5).map(i => TestRow(Vectors.dense(i, i + 1)))
-    val dataset = spark.createDataFrame(rdd)
-
-    val gaussian1 = new MultivariateGaussian(Vectors.dense(Array(-1.0, -1.0)),
-      Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)))
-    val gaussian2 = new MultivariateGaussian(Vectors.dense(Array(1.0, 1.0)),
-      Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)))
-    val model = new GaussianMixtureModel("gmm", Array(0.2, 0.8), Array(gaussian1, gaussian2))
-
-    val llk = model.computeLogLikelihood(dataset)
-    assert(llk ~== -52.804472030823533 relTol 1E-6)
-    /*
-      Using the following Python code to compute the log-likelihood:
-
-      import numpy as np
-      from scipy.stats import multivariate_normal
-      data = np.array([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6]])
-      pdf1 = multivariate_normal.pdf(data, mean=[-1, -1], cov=np.array([[1, 0], [0, 1]]))
-      pdf2 = multivariate_normal.pdf(data, mean=[1, 1], cov=np.array([[1, 0], [0, 1]]))
-      pdf = pdf1 * 0.2 + pdf2 * 0.8
-      > sum(np.log(pdf))
-      -52.804472030823533
-    */
-  }
-
   test("read/write") {
     def checkModelData(model: GaussianMixtureModel, model2: GaussianMixtureModel): Unit = {
       assert(model.weights === model2.weights)
@@ -233,6 +207,10 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
            [,1]      [,2]
      [1,] 0.2961543 0.160783
      [2,] 0.1607830 1.008878
+
+     model$loglik
+
+     [1] -46.89499
      */
     val weights = Array(0.5333333, 0.4666667)
     val means = Array(Vectors.dense(10.363673, 9.897081), Vectors.dense(0.11731091, -0.06192351))
@@ -245,6 +223,9 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     val expected = new GaussianMixtureModel("dummy", weights, gaussians)
     val actual = new GaussianMixture().setK(2).setSeed(seed).fit(rDataset)
     modelEquals(expected, actual)
+
+    val llk = expected.computeLogLikelihood(rDataset)
+    assert(llk ~== -46.89499 absTol 1E-6)
   }
 
   test("upper triangular matrix unpacking") {
From 8d8bb242698f3342133cbc473dbf4a9b64281cf7 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Thu, 12 Jan 2017 11:41:50 +0800
Subject: [PATCH 10/19] update tol

---
 .../org/apache/spark/ml/clustering/GaussianMixtureSuite.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index 9149dbf3f7994..a25bea5505c6b 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -225,7 +225,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     modelEquals(expected, actual)
 
     val llk = expected.computeLogLikelihood(rDataset)
-    assert(llk ~== -46.89499 absTol 1E-6)
+    assert(llk ~== -46.89499 absTol 1E-5)
   }
 
   test("upper triangular matrix unpacking") {

From fe2c424a4aa08f5f50387069db26f179e50395d4 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Mon, 16 Jan 2017 13:01:19 +0800
Subject: [PATCH 11/19] add llk in summary

---
 .../spark/ml/clustering/GaussianMixture.scala | 26 ++++---------------
 .../ml/clustering/GaussianMixtureSuite.scala  |  2 +-
 python/pyspark/ml/clustering.py               |  8 ++++++
 3 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index f2928753fd579..db5fff5af86ef 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -129,25 +129,6 @@ class GaussianMixtureModel private[ml] (
     Vectors.dense(probs)
   }
 
-  /**
-   * Return the total log-likelihood for this model on the given data.
-   */
-  @Since("2.2.0")
-  def computeLogLikelihood(dataset: Dataset[_]): Double = {
-    SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT)
-    val spark = dataset.sparkSession
-    import spark.implicits._
-
-    val bcWeightAndDists = spark.sparkContext.broadcast(weights.zip(gaussians))
-    dataset.select(col($(featuresCol))).map {
-      case Row(feature: Vector) =>
-        val likelihood = bcWeightAndDists.value.map {
-          case (weight, dist) => EPSILON + weight * dist.pdf(feature)
-        }.sum
-        math.log(likelihood)
-    }.reduce(_ + _)
-  }
-
   /**
    * Retrieve Gaussian distributions as a DataFrame.
    * Each row represents a Gaussian Distribution.
@@ -435,7 +416,7 @@ class GaussianMixture @Since("2.0.0") (
     val model = copyValues(new GaussianMixtureModel(uid, weights, gaussianDists)).setParent(this)
     val summary = new GaussianMixtureSummary(model.transform(dataset),
-      $(predictionCol), $(probabilityCol), $(featuresCol), $(k))
+      $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood)
     model.setSummary(Some(summary))
     instr.logSuccess(model)
     model
@@ -693,6 +674,7 @@ private class ExpectationAggregator(
  * in `predictions`.
  * @param featuresCol Name for column of features in `predictions`.
  * @param k Number of clusters.
+ * @param logLikelihood Total log-likelihood for this model on the given data.
  */
 @Since("2.0.0")
 @Experimental
@@ -701,7 +683,9 @@ class GaussianMixtureSummary private[clustering] (
     predictionCol: String,
     @Since("2.0.0") val probabilityCol: String,
     featuresCol: String,
-    k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k) {
+    k: Int,
+    @Since("2.2.0") val logLikelihood: Double)
+  extends ClusteringSummary(predictions, predictionCol, featuresCol, k) {
 
   /**
    * Probability of each cluster.
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
index a25bea5505c6b..e54eb2750c389 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -224,7 +224,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     val actual = new GaussianMixture().setK(2).setSeed(seed).fit(rDataset)
     modelEquals(expected, actual)
 
-    val llk = expected.computeLogLikelihood(rDataset)
+    val llk = actual.summary.logLikelihood
     assert(llk ~== -46.89499 absTol 1E-5)
 
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 25f97f5696a14..8e68c68852c5d 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -281,6 +281,14 @@ def probability(self):
         """
         return self._call_java("probability")
 
+    @property
+    @since("2.2.0")
+    def logLikelihood(self):
+        """
+        Total log-likelihood for this model on the given data.
+        """
+        return self._call_java("logLikelihood")
+
 
 class KMeansSummary(ClusteringSummary):
     """
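With patch 11 the log-likelihood is no longer a public method on the model: it is computed once during fitting and exposed as GaussianMixtureSummary.logLikelihood on both the Scala and Python sides. A usage sketch against that API shape (the toy data, k, and seed are illustrative only, not taken from the PR):

    import org.apache.spark.ml.clustering.GaussianMixture
    import org.apache.spark.ml.linalg.Vectors
    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().master("local[2]").appName("gmm-llk").getOrCreate()
    import spark.implicits._

    // Two well-separated blobs; "features" is the default features column name.
    val dataset = Seq(
      Vectors.dense(-5.0, -4.9), Vectors.dense(-5.1, -5.2), Vectors.dense(-4.8, -5.1),
      Vectors.dense(5.0, 5.1), Vectors.dense(4.9, 5.3), Vectors.dense(5.2, 4.8)
    ).map(Tuple1.apply).toDF("features")

    val model = new GaussianMixture().setK(2).setSeed(1L).fit(dataset)
    println(model.summary.logLikelihood) // total log-likelihood on the training data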
From 68f72fa84afbe9381bebbb620c0ceda7e48dd36e Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Mon, 16 Jan 2017 16:11:10 +0800
Subject: [PATCH 12/19] add mima excludes

---
 project/MimaExcludes.scala | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 2314d7f45cb21..ba667d2bb5d60 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -44,6 +44,9 @@ object MimaExcludes {
 
     // [SPARK-18537] Add a REST api to spark streaming
     ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.streaming.scheduler.StreamingListener.onStreamingStarted")
+
+    // [SPARK-14272][ML] Evaluate GaussianMixtureModel with LogLikelihood
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.GaussianMixtureSummary.this")
   )
 
   // Exclude rules for 2.1.x
From d3336429dfb6564f93ddaa615771882fc72eed58 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng
Date: Mon, 16 Jan 2017 16:17:17 +0800
Subject: [PATCH 13/19] fix mima

---
 project/MimaExcludes.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index ba667d2bb5d60..67b00af436c54 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -43,7 +43,7 @@ object MimaExcludes {
     ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.recoverPartitions"),
 
     // [SPARK-18537] Add a REST api to spark streaming
-    ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.streaming.scheduler.StreamingListener.onStreamingStarted")
+    ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.streaming.scheduler.StreamingListener.onStreamingStarted"),
 
     // [SPARK-14272][ML] Evaluate GaussianMixtureModel with LogLikelihood
     ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.GaussianMixtureSummary.this")

From d6fa8fa83ebfb3a77185da1c63991a778841e84b Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Tue, 17 Jan 2017 17:13:22 +0800
Subject: [PATCH 14/19] add test

---
 python/pyspark/ml/clustering.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 8e68c68852c5d..7ed226a9df34c 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -175,6 +175,7 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     3
     >>> summary.clusterSizes
     [2, 2, 2]
+    >>> summary.logLikelihood
     >>> weights = model.weights
     >>> len(weights)
     3

From fd85c5d221a0cc52c8b5f4662182d487e34db63b Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Wed, 18 Jan 2017 09:37:14 +0800
Subject: [PATCH 15/19] add output of summary

---
 python/pyspark/ml/clustering.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 7ed226a9df34c..19a1a2815880a 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -176,6 +176,7 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     >>> summary.clusterSizes
     [2, 2, 2]
     >>> summary.logLikelihood
+    8.1463602464817928
     >>> weights = model.weights
     >>> len(weights)
     3

From eebae43c84a1179260648f7f5cdbb63a60fcc40d Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Wed, 18 Jan 2017 14:31:47 +0800
Subject: [PATCH 16/19] update doc test

---
 python/pyspark/ml/clustering.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 19a1a2815880a..79c7da0b63131 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -176,7 +176,7 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     >>> summary.clusterSizes
     [2, 2, 2]
     >>> summary.logLikelihood
-    8.1463602464817928
+    8.146360246481793
     >>> weights = model.weights
     >>> len(weights)
     3

From eb27bcc9c571cda08303a65b65c2d3d947e900ac Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Wed, 18 Jan 2017 18:48:43 +0800
Subject: [PATCH 17/19] update doc test

---
 python/pyspark/ml/clustering.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 79c7da0b63131..c6c1a0033190e 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -176,7 +176,7 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     >>> summary.clusterSizes
     [2, 2, 2]
     >>> summary.logLikelihood
-    8.146360246481793
+    8.14636...
     >>> weights = model.weights
     >>> len(weights)
     3
From e3bc962cb81a11bdcf4c910f6cab319769d177ca Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Thu, 19 Jan 2017 15:23:18 +0800
Subject: [PATCH 18/19] update mima

---
 project/MimaExcludes.scala | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index e2934f75b0fcf..c79c19ab49923 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -46,7 +46,10 @@ object MimaExcludes {
     ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.streaming.scheduler.StreamingListener.onStreamingStarted"),
 
     // [SPARK-19148][SQL] do not expose the external table concept in Catalog
-    ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.createTable")
+    ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.createTable"),
+
+    // [SPARK-14272][ML] Evaluate GaussianMixtureModel with LogLikelihood.
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.GaussianMixtureSummary.this")
   )
 
   // Exclude rules for 2.1.x
@@ -933,9 +936,6 @@ object MimaExcludes {
     ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.classification.RandomForestClassificationModel.setFeatureSubsetStrategy"),
     ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.numTrees"),
     ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.ml.regression.RandomForestRegressionModel.setFeatureSubsetStrategy")
-  ) ++ Seq(
-    // [SPARK-14272][ML] Evaluate GaussianMixtureModel with LogLikelihood.
-    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.GaussianMixtureSummary.this")
   )
 }

From cbec946583536283bf31dd5fb4f61b724e502e68 Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Thu, 19 Jan 2017 15:26:19 +0800
Subject: [PATCH 19/19] update pr desc

---
 project/MimaExcludes.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index c79c19ab49923..bf628210a16e6 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -48,7 +48,7 @@ object MimaExcludes {
     // [SPARK-19148][SQL] do not expose the external table concept in Catalog
     ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.createTable"),
 
-    // [SPARK-14272][ML] Evaluate GaussianMixtureModel with LogLikelihood.
+    // [SPARK-14272][ML] Add logLikelihood in GaussianMixtureSummary
     ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.GaussianMixtureSummary.this")
   )
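Continuing the usage sketch after patch 11 (same spark, dataset, and model), the summary value can be cross-checked against the weighted-pdf formula that patch 02 introduced, now evaluated at the fitted parameters. The two need only agree approximately: the summary reports the EM training objective at convergence, not a fresh evaluation under the final parameter update.

    import org.apache.spark.ml.linalg.Vector

    // Recompute sum_i log( sum_k w_k * pdf_k(x_i) ) from the fitted model.
    val recomputed = dataset.select("features").collect()
      .map(_.getAs[Vector](0))
      .map { x =>
        math.log(model.weights.zip(model.gaussians).map { case (w, g) => w * g.pdf(x) }.sum)
      }.sum
    println(s"summary: ${model.summary.logLikelihood}, recomputed: $recomputed")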