From 2f3867676d718d726d0a6168520a5fda84d06399 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng <ruifengz@foxmail.com>
Date: Mon, 9 May 2016 21:09:03 +0800
Subject: [PATCH 1/4] recreate pr

---
 data/mllib/sample_kmeans_data.txt             |  6 ++
 docs/ml-clustering.md                         | 46 ++++++++++++-
 .../ml/JavaBisectingKMeansExample.java        | 36 +++-------
 .../python/ml/bisecting_k_means_example.py    | 24 +++----
 .../examples/ml/BisectingKMeansExample.scala  | 67 +++++++++++++++++++
 5 files changed, 138 insertions(+), 41 deletions(-)
 create mode 100644 data/mllib/sample_kmeans_data.txt
 create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
diff --git a/data/mllib/sample_kmeans_data.txt b/data/mllib/sample_kmeans_data.txt
new file mode 100644
index 0000000000000..50013776b182a
--- /dev/null
+++ b/data/mllib/sample_kmeans_data.txt
@@ -0,0 +1,6 @@
+0 1:0.0 2:0.0 3:0.0
+1 1:0.1 2:0.1 3:0.1
+2 1:0.2 2:0.2 3:0.2
+3 1:9.0 2:9.0 3:9.0
+4 1:9.1 2:9.1 3:9.1
+5 1:9.2 2:9.2 3:9.2
diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md
index 440c455cd077c..6a8c81f7339b8 100644
--- a/docs/ml-clustering.md
+++ b/docs/ml-clustering.md
@@ -104,4 +104,48 @@ Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/LDA.html) f
 {% include_example java/org/apache/spark/examples/ml/JavaLDAExample.java %}
 </div>
 
-</div>
\ No newline at end of file
+</div>
+
+## Bisecting k-means
+
+
+Bisecting k-means is a kind of [hierarchical clustering](https://en.wikipedia.org/wiki/Hierarchical_clustering) using a
+divisive (or "top-down") approach: all observations start in one cluster, and splits are performed recursively as one
+moves down the hierarchy.
+
+Bisecting k-means can often be much faster than regular k-means, but it will generally produce a different clustering.
+
+`BisectingKMeans` is implemented as an `Estimator` and generates a `BisectingKMeansModel` as the base model.
+
+The implementation in ML has the following parameters:
+
+* *k*: the desired number of leaf clusters (default: 4). The actual number could be smaller if there are no divisible leaf clusters.
+* *maxIter*: the max number of k-means iterations to split clusters (default: 20)
+* *minDivisibleClusterSize*: the minimum number of points (if >= 1.0) or the minimum proportion of points (if < 1.0) of a divisible cluster (default: 1)
+* *seed*: a random seed (default: hash value of the class name)
+* *featuresCol*: the features column name (default: "features")
+* *predictionCol*: the prediction column name (default: "prediction")
+
+### Example
+
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.clustering.BisectingKMeans) for more details.
+
+{% include_example scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala %}
+</div>
+
+<div data-lang="java" markdown="1">
+Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/BisectingKMeans.html) for more details.
+
+{% include_example java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java %}
+</div>
+
+<div data-lang="python" markdown="1">
+Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering.BisectingKMeans) for more details.
+
+{% include_example python/ml/bisecting_k_means_example.py %}
+</div>
+
+</div>
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
index 810ad905c56af..722f5105107f4 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
@@ -17,23 +17,14 @@
 
 package org.apache.spark.examples.ml;
 
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SparkSession;
 // $example on$
 import org.apache.spark.ml.clustering.BisectingKMeans;
 import org.apache.spark.ml.clustering.BisectingKMeansModel;
 import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
-import org.apache.spark.sql.types.Metadata;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
 // $example off$
+import org.apache.spark.sql.SparkSession;
 
 
 /**
@@ -48,26 +39,19 @@ public static void main(String[] args) {
       .getOrCreate();
 
     // $example on$
-    List<Row> data = Arrays.asList(
-      RowFactory.create(Vectors.dense(0.1, 0.1, 0.1)),
-      RowFactory.create(Vectors.dense(0.3, 0.3, 0.25)),
-      RowFactory.create(Vectors.dense(0.1, 0.1, -0.1)),
-      RowFactory.create(Vectors.dense(20.3, 20.1, 19.9)),
-      RowFactory.create(Vectors.dense(20.2, 20.1, 19.7)),
-      RowFactory.create(Vectors.dense(18.9, 20.0, 19.7))
-    );
-
-    StructType schema = new StructType(new StructField[]{
-      new StructField("features", new VectorUDT(), false, Metadata.empty()),
-    });
-
-    Dataset<Row> dataset = spark.createDataFrame(data, schema);
+    // Loads data.
+    Dataset<Row> dataset = spark.read().format("libsvm").load("data/mllib/sample_kmeans_data.txt");
 
-    BisectingKMeans bkm = new BisectingKMeans().setK(2);
+    // Trains a bisecting k-means model.
+    BisectingKMeans bkm = new BisectingKMeans().setK(2).setSeed(1);
     BisectingKMeansModel model = bkm.fit(dataset);
 
-    System.out.println("Compute Cost: " + model.computeCost(dataset));
+    // Evaluate clustering.
+    double cost = model.computeCost(dataset);
+    System.out.println("Compute Cost: " + cost);
 
+    // Shows the result.
+    System.out.println("Final Centers: ");
     Vector[] clusterCenters = model.clusterCenters();
     for (int i = 0; i < clusterCenters.length; i++) {
       Vector clusterCenter = clusterCenters[i];
diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py
index 540a4bc3e4bf9..d57cb555a3452 100644
--- a/examples/src/main/python/ml/bisecting_k_means_example.py
+++ b/examples/src/main/python/ml/bisecting_k_means_example.py
@@ -19,9 +19,6 @@
 
 # $example on$
 from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel
-from pyspark.mllib.linalg import VectorUDT, _convert_to_vector, Vectors
-from pyspark.mllib.linalg import Vectors
-from pyspark.sql.types import Row
 # $example off$
 from pyspark.sql import SparkSession
 
@@ -36,21 +33,20 @@
         .getOrCreate()
 
     # $example on$
-    data = spark.read.text("data/mllib/kmeans_data.txt").rdd
-    parsed = data\
-        .map(lambda row: Row(features=Vectors.dense([float(x) for x in row.value.split(' ')])))
-    training = spark.createDataFrame(parsed)
+    # Loads data.
+    dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")
 
-    kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features")
+    # Trains a bisecting k-means model.
+    bkm = BisectingKMeans().setK(2).setSeed(1)
+    model = bkm.fit(dataset)
 
-    model = kmeans.fit(training)
+    # Evaluate clustering.
+    cost = model.computeCost(dataset)
+    print("Compute Cost = " + str(cost))
 
-    # Evaluate clustering
-    cost = model.computeCost(training)
-    print("Bisecting K-means Cost = " + str(cost))
-
-    centers = model.clusterCenters()
+    # Shows the result.
     print("Cluster Centers: ")
+    centers = model.clusterCenters()
     for center in centers:
         print(center)
     # $example off$
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
new file mode 100644
index 0000000000000..b338551cecbb3
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+// scalastyle:off println
+
+// $example on$
+import org.apache.spark.ml.clustering.BisectingKMeans
+// $example off$
+import org.apache.spark.sql.SparkSession
+
+/**
+ * An example demonstrating a bisecting k-means clustering.
+ * Run with
+ * {{{
+ * bin/run-example ml.BisectingKMeansExample
+ * }}}
+ */
+object BisectingKMeansExample {
+
+  def main(args: Array[String]): Unit = {
+    // Creates a SparkSession
+    val spark = SparkSession
+      .builder
+      .appName("BisectingKMeansExample")
+      .getOrCreate()
+
+    // $example on$
+    // Loads data.
+    val dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")
+
+    // Trains a bisecting k-means model.
+    val bkm = new BisectingKMeans().setK(2).setSeed(1)
+    val model = bkm.fit(dataset)
+
+    // Evaluate clustering.
+    val cost = model.computeCost(dataset)
+    println(s"Compute Cost: $cost")
+
+    // Shows the result.
+    println("Final Centers: ")
+    model.clusterCenters.zipWithIndex.foreach {
+      case (clusterCenter, i) =>
+        println(s"Cluster Center $i: $clusterCenter")
+    }
+    // $example off$
+
+    spark.stop()
+  }
+}
+// scalastyle:on println
+

From 449c2f676ca52c0c6d78d857fbdf5f9b4a4e3748 Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng <ruifengz@foxmail.com>
Date: Wed, 11 May 2016 10:21:14 +0800
Subject: [PATCH 2/4] del params in guide; add run cmds in comments

---
 docs/ml-clustering.md                                    | 9 ---------
 .../spark/examples/ml/JavaBisectingKMeansExample.java    | 6 +++++-
 examples/src/main/python/ml/bisecting_k_means_example.py | 6 ++++--
 .../spark/examples/ml/BisectingKMeansExample.scala       | 2 +-
 4 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md
index 6a8c81f7339b8..1245b8bbc844c 100644
--- a/docs/ml-clustering.md
+++ b/docs/ml-clustering.md
@@ -117,15 +117,6 @@ Bisecting K-means can often be much faster than regular K-means, but it will gen
 
 `BisectingKMeans` is implemented as an `Estimator` and generates a `BisectingKMeansModel` as the base model.
 
-The implementation in ML has the following parameters:
-
-* *k*: the desired number of leaf clusters (default: 4). The actual number could be smaller if there are no divisible leaf clusters.
-* *maxIter*: the max number of k-means iterations to split clusters (default: 20)
-* *minDivisibleClusterSize*: the minimum number of points (if >= 1.0) or the minimum proportion of points (if < 1.0) of a divisible cluster (default: 1)
-* *seed*: a random seed (default: hash value of the class name)
-* *featuresCol*: the features column name (default: "features")
-* *predictionCol*: the prediction column name (default: "prediction")
-
 ### Example
 
 <div class="codetabs">
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
index 722f5105107f4..7055e055f8dfa 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
@@ -28,7 +28,11 @@
 
 
 /**
- * An example demonstrating a bisecting k-means clustering.
+ * An example demonstrating bisecting k-means clustering.
+ * Run with
+ * <pre>
+ * bin/run-example ml.JavaBisectingKMeansExample
+ * </pre>
  */
 public class JavaBisectingKMeansExample {
 
diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py
index d57cb555a3452..411a7db0db608 100644
--- a/examples/src/main/python/ml/bisecting_k_means_example.py
+++ b/examples/src/main/python/ml/bisecting_k_means_example.py
@@ -18,12 +18,14 @@
 from __future__ import print_function
 
 # $example on$
-from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel
+from pyspark.ml.clustering import BisectingKMeans
 # $example off$
 from pyspark.sql import SparkSession
 
 """
-A simple example demonstrating a bisecting k-means clustering.
+An example demonstrating bisecting k-means clustering.
+Run with:
+  bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py
 """
 
 if __name__ == "__main__":
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
index b338551cecbb3..76e98af7a178a 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
@@ -25,7 +25,7 @@ import org.apache.spark.ml.clustering.BisectingKMeans
 import org.apache.spark.sql.SparkSession
 
 /**
- * An example demonstrating a bisecting k-means clustering.
+ * An example demonstrating bisecting k-means clustering.
  * Run with
  * {{{
  * bin/run-example ml.BisectingKMeansExample

From 3adcea3aa1a78d42c4ea11b642a370b5f2de6e5f Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng <ruifengz@foxmail.com>
Date: Wed, 11 May 2016 14:56:41 +0800
Subject: [PATCH 3/4] change cost str; del i in result

---
 .../spark/examples/ml/JavaBisectingKMeansExample.java  |  9 ++++-----
 .../src/main/python/ml/bisecting_k_means_example.py    |  2 +-
 .../spark/examples/ml/BisectingKMeansExample.scala     | 10 ++++------
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
index 7055e055f8dfa..41fbd1b197d37 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
@@ -52,14 +52,13 @@ public static void main(String[] args) {
 
     // Evaluate clustering.
     double cost = model.computeCost(dataset);
-    System.out.println("Compute Cost: " + cost);
+    System.out.println("Within Set Sum of Squared Errors = " + cost);
 
     // Shows the result.
-    System.out.println("Final Centers: ");
+    System.out.println("Cluster Centers: ");
     Vector[] clusterCenters = model.clusterCenters();
-    for (int i = 0; i < clusterCenters.length; i++) {
-      Vector clusterCenter = clusterCenters[i];
-      System.out.println("Cluster Center " + i + ": " + clusterCenter);
+    for (Vector clusterCenter : clusterCenters) {
+      System.out.println(clusterCenter);
     }
     // $example off$
 
diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py
index 411a7db0db608..ee0399ac5eb20 100644
--- a/examples/src/main/python/ml/bisecting_k_means_example.py
+++ b/examples/src/main/python/ml/bisecting_k_means_example.py
@@ -44,7 +44,7 @@
 
     # Evaluate clustering.
     cost = model.computeCost(dataset)
-    print("Compute Cost = " + str(cost))
+    print("Within Set Sum of Squared Errors = " + str(cost))
 
     # Shows the result.
     print("Cluster Centers: ")
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
index 76e98af7a178a..5f8f2c99cbaf4 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala
@@ -50,14 +50,12 @@ object BisectingKMeansExample {
 
     // Evaluate clustering.
     val cost = model.computeCost(dataset)
-    println(s"Compute Cost: $cost")
+    println(s"Within Set Sum of Squared Errors = $cost")
 
     // Shows the result.
-    println("Final Centers: ")
-    model.clusterCenters.zipWithIndex.foreach {
-      case (clusterCenter, i) =>
-        println(s"Cluster Center $i: $clusterCenter")
-    }
+    println("Cluster Centers: ")
+    val centers = model.clusterCenters
+    centers.foreach(println)
     // $example off$
 
     spark.stop()

From 8cd45d33281f1435ddef5fef00c0b10e349b76dd Mon Sep 17 00:00:00 2001
From: Zheng RuiFeng <ruifengz@foxmail.com>
Date: Wed, 11 May 2016 14:58:08 +0800
Subject: [PATCH 4/4] rename one var

---
 .../spark/examples/ml/JavaBisectingKMeansExample.java       | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
index 41fbd1b197d37..62871448e36f5 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java
@@ -56,9 +56,9 @@ public static void main(String[] args) {
 
     // Shows the result.
     System.out.println("Cluster Centers: ");
-    Vector[] clusterCenters = model.clusterCenters();
-    for (Vector clusterCenter : clusterCenters) {
-      System.out.println(clusterCenter);
+    Vector[] centers = model.clusterCenters();
+    for (Vector center : centers) {
+      System.out.println(center);
     }
     // $example off$