From 2f3867676d718d726d0a6168520a5fda84d06399 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Mon, 9 May 2016 21:09:03 +0800 Subject: [PATCH 1/4] recreate pr --- data/mllib/sample_kmeans_data.txt | 6 ++ docs/ml-clustering.md | 46 ++++++++++++- .../ml/JavaBisectingKMeansExample.java | 36 +++------- .../python/ml/bisecting_k_means_example.py | 24 +++---- .../examples/ml/BisectingKMeansExample.scala | 67 +++++++++++++++++++ 5 files changed, 138 insertions(+), 41 deletions(-) create mode 100644 data/mllib/sample_kmeans_data.txt create mode 100644 examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala diff --git a/data/mllib/sample_kmeans_data.txt b/data/mllib/sample_kmeans_data.txt new file mode 100644 index 0000000000000..50013776b182a --- /dev/null +++ b/data/mllib/sample_kmeans_data.txt @@ -0,0 +1,6 @@ +0 1:0.0 2:0.0 3:0.0 +1 1:0.1 2:0.1 3:0.1 +2 1:0.2 2:0.2 3:0.2 +3 1:9.0 2:9.0 3:9.0 +4 1:9.1 2:9.1 3:9.1 +5 1:9.2 2:9.2 3:9.2 diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md index 440c455cd077c..6a8c81f7339b8 100644 --- a/docs/ml-clustering.md +++ b/docs/ml-clustering.md @@ -104,4 +104,48 @@ Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/LDA.html) f {% include_example java/org/apache/spark/examples/ml/JavaLDAExample.java %} - \ No newline at end of file + + +## Bisecting k-means + + +Bisecting k-means is a kind of [hierarchical clustering](https://en.wikipedia.org/wiki/Hierarchical_clustering) using a +divisive (or "top-down") approach: all observations start in one cluster, and splits are performed recursively as one +moves down the hierarchy. + +Bisecting K-means can often be much faster than regular K-means, but it will generally produce a different clustering. + +`BisectingKMeans` is implemented as an `Estimator` and generates a `BisectingKMeansModel` as the base model. + +The implementation in ML has the following parameters: + +* *k*: the desired number of leaf clusters (default: 4). The actual number could be smaller if there are no divisible leaf clusters. +* *maxIter*: the max number of k-means iterations to split clusters (default: 20) +* *minDivisibleClusterSize*: the minimum number of points (if >= 1.0) or the minimum proportion of points (if < 1.0) of a divisible cluster (default: 1) +* *seed*: a random seed (default: hash value of the class name) +* *featuresCol*: the features column name (default: "features") +* *predictionCol*: the prediction column name (default: "prediction") + +### Example + +
+ +
+Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.clustering.BisectingKMeans) for more details. + +{% include_example scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala %} +
+ +
+Refer to the [Java API docs](api/java/org/apache/spark/ml/clustering/BisectingKMeans.html) for more details. + +{% include_example java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java %} +
+ +
+Refer to the [Python API docs](api/python/pyspark.ml.html#pyspark.ml.clustering.BisectingKMeans) for more details. + +{% include_example python/ml/bisecting_k_means_example.py %} +
+ +
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java index 810ad905c56af..722f5105107f4 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java @@ -17,23 +17,14 @@ package org.apache.spark.examples.ml; -import java.util.Arrays; -import java.util.List; - -import org.apache.spark.sql.RowFactory; -import org.apache.spark.sql.SparkSession; // $example on$ import org.apache.spark.ml.clustering.BisectingKMeans; import org.apache.spark.ml.clustering.BisectingKMeansModel; import org.apache.spark.mllib.linalg.Vector; -import org.apache.spark.mllib.linalg.VectorUDT; -import org.apache.spark.mllib.linalg.Vectors; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; -import org.apache.spark.sql.types.Metadata; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; // $example off$ +import org.apache.spark.sql.SparkSession; /** @@ -48,26 +39,19 @@ public static void main(String[] args) { .getOrCreate(); // $example on$ - List data = Arrays.asList( - RowFactory.create(Vectors.dense(0.1, 0.1, 0.1)), - RowFactory.create(Vectors.dense(0.3, 0.3, 0.25)), - RowFactory.create(Vectors.dense(0.1, 0.1, -0.1)), - RowFactory.create(Vectors.dense(20.3, 20.1, 19.9)), - RowFactory.create(Vectors.dense(20.2, 20.1, 19.7)), - RowFactory.create(Vectors.dense(18.9, 20.0, 19.7)) - ); - - StructType schema = new StructType(new StructField[]{ - new StructField("features", new VectorUDT(), false, Metadata.empty()), - }); - - Dataset dataset = spark.createDataFrame(data, schema); + // Loads data. + Dataset dataset = spark.read().format("libsvm").load("data/mllib/sample_kmeans_data.txt"); - BisectingKMeans bkm = new BisectingKMeans().setK(2); + // Trains a bisecting k-means model. + BisectingKMeans bkm = new BisectingKMeans().setK(2).setSeed(1); BisectingKMeansModel model = bkm.fit(dataset); - System.out.println("Compute Cost: " + model.computeCost(dataset)); + // Evaluate clustering. + double cost = model.computeCost(dataset); + System.out.println("Compute Cost: " + cost); + // Shows the result. + System.out.println("Final Centers: "); Vector[] clusterCenters = model.clusterCenters(); for (int i = 0; i < clusterCenters.length; i++) { Vector clusterCenter = clusterCenters[i]; diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index 540a4bc3e4bf9..d57cb555a3452 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -19,9 +19,6 @@ # $example on$ from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel -from pyspark.mllib.linalg import VectorUDT, _convert_to_vector, Vectors -from pyspark.mllib.linalg import Vectors -from pyspark.sql.types import Row # $example off$ from pyspark.sql import SparkSession @@ -36,21 +33,20 @@ .getOrCreate() # $example on$ - data = spark.read.text("data/mllib/kmeans_data.txt").rdd - parsed = data\ - .map(lambda row: Row(features=Vectors.dense([float(x) for x in row.value.split(' ')]))) - training = spark.createDataFrame(parsed) + # Loads data. + dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt") - kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features") + # Trains a bisecting k-means model. + bkm = BisectingKMeans().setK(2).setSeed(1) + model = bkm.fit(dataset) - model = kmeans.fit(training) + # Evaluate clustering. + cost = model.computeCost(dataset) + print("Compute Cost = " + str(cost)) - # Evaluate clustering - cost = model.computeCost(training) - print("Bisecting K-means Cost = " + str(cost)) - - centers = model.clusterCenters() + # Shows the result. print("Cluster Centers: ") + centers = model.clusterCenters() for center in centers: print(center) # $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala new file mode 100644 index 0000000000000..b338551cecbb3 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.ml + +// scalastyle:off println + +// $example on$ +import org.apache.spark.ml.clustering.BisectingKMeans +// $example off$ +import org.apache.spark.sql.SparkSession + +/** + * An example demonstrating a bisecting k-means clustering. + * Run with + * {{{ + * bin/run-example ml.BisectingKMeansExample + * }}} + */ +object BisectingKMeansExample { + + def main(args: Array[String]): Unit = { + // Creates a SparkSession + val spark = SparkSession + .builder + .appName("BisectingKMeansExample") + .getOrCreate() + + // $example on$ + // Loads data. + val dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt") + + // Trains a bisecting k-means model. + val bkm = new BisectingKMeans().setK(2).setSeed(1) + val model = bkm.fit(dataset) + + // Evaluate clustering. + val cost = model.computeCost(dataset) + println(s"Compute Cost: $cost") + + // Shows the result. + println("Final Centers: ") + model.clusterCenters.zipWithIndex.foreach { + case (clusterCenter, i) => + println(s"Cluster Center $i: $clusterCenter") + } + // $example off$ + + spark.stop() + } +} +// scalastyle:on println + From 449c2f676ca52c0c6d78d857fbdf5f9b4a4e3748 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Wed, 11 May 2016 10:21:14 +0800 Subject: [PATCH 2/4] del params in guide; add run cmds in comments --- docs/ml-clustering.md | 9 --------- .../spark/examples/ml/JavaBisectingKMeansExample.java | 6 +++++- examples/src/main/python/ml/bisecting_k_means_example.py | 6 ++++-- .../spark/examples/ml/BisectingKMeansExample.scala | 2 +- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/docs/ml-clustering.md b/docs/ml-clustering.md index 6a8c81f7339b8..1245b8bbc844c 100644 --- a/docs/ml-clustering.md +++ b/docs/ml-clustering.md @@ -117,15 +117,6 @@ Bisecting K-means can often be much faster than regular K-means, but it will gen `BisectingKMeans` is implemented as an `Estimator` and generates a `BisectingKMeansModel` as the base model. -The implementation in ML has the following parameters: - -* *k*: the desired number of leaf clusters (default: 4). The actual number could be smaller if there are no divisible leaf clusters. -* *maxIter*: the max number of k-means iterations to split clusters (default: 20) -* *minDivisibleClusterSize*: the minimum number of points (if >= 1.0) or the minimum proportion of points (if < 1.0) of a divisible cluster (default: 1) -* *seed*: a random seed (default: hash value of the class name) -* *featuresCol*: the features column name (default: "features") -* *predictionCol*: the prediction column name (default: "prediction") - ### Example
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java index 722f5105107f4..7055e055f8dfa 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java @@ -28,7 +28,11 @@ /** - * An example demonstrating a bisecting k-means clustering. + * An example demonstrating bisecting k-means clustering. + * Run with + *
+ * bin/run-example ml.JavaBisectingKMeansExample
+ * 
*/ public class JavaBisectingKMeansExample { diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index d57cb555a3452..411a7db0db608 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -18,12 +18,14 @@ from __future__ import print_function # $example on$ -from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel +from pyspark.ml.clustering import BisectingKMeans # $example off$ from pyspark.sql import SparkSession """ -A simple example demonstrating a bisecting k-means clustering. +An example demonstrating bisecting k-means clustering. +Run with: + bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py """ if __name__ == "__main__": diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala index b338551cecbb3..76e98af7a178a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala @@ -25,7 +25,7 @@ import org.apache.spark.ml.clustering.BisectingKMeans import org.apache.spark.sql.SparkSession /** - * An example demonstrating a bisecting k-means clustering. + * An example demonstrating bisecting k-means clustering. * Run with * {{{ * bin/run-example ml.BisectingKMeansExample From 3adcea3aa1a78d42c4ea11b642a370b5f2de6e5f Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Wed, 11 May 2016 14:56:41 +0800 Subject: [PATCH 3/4] change cost str; del i in result --- .../spark/examples/ml/JavaBisectingKMeansExample.java | 9 ++++----- .../src/main/python/ml/bisecting_k_means_example.py | 2 +- .../spark/examples/ml/BisectingKMeansExample.scala | 10 ++++------ 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java index 7055e055f8dfa..41fbd1b197d37 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java @@ -52,14 +52,13 @@ public static void main(String[] args) { // Evaluate clustering. double cost = model.computeCost(dataset); - System.out.println("Compute Cost: " + cost); + System.out.println("Within Set Sum of Squared Errors = " + cost); // Shows the result. - System.out.println("Final Centers: "); + System.out.println("Cluster Centers: "); Vector[] clusterCenters = model.clusterCenters(); - for (int i = 0; i < clusterCenters.length; i++) { - Vector clusterCenter = clusterCenters[i]; - System.out.println("Cluster Center " + i + ": " + clusterCenter); + for (Vector clusterCenter : clusterCenters) { + System.out.println(clusterCenter); } // $example off$ diff --git a/examples/src/main/python/ml/bisecting_k_means_example.py b/examples/src/main/python/ml/bisecting_k_means_example.py index 411a7db0db608..ee0399ac5eb20 100644 --- a/examples/src/main/python/ml/bisecting_k_means_example.py +++ b/examples/src/main/python/ml/bisecting_k_means_example.py @@ -44,7 +44,7 @@ # Evaluate clustering. cost = model.computeCost(dataset) - print("Compute Cost = " + str(cost)) + print("Within Set Sum of Squared Errors = " + str(cost)) # Shows the result. print("Cluster Centers: ") diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala index 76e98af7a178a..5f8f2c99cbaf4 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/BisectingKMeansExample.scala @@ -50,14 +50,12 @@ object BisectingKMeansExample { // Evaluate clustering. val cost = model.computeCost(dataset) - println(s"Compute Cost: $cost") + println(s"Within Set Sum of Squared Errors = $cost") // Shows the result. - println("Final Centers: ") - model.clusterCenters.zipWithIndex.foreach { - case (clusterCenter, i) => - println(s"Cluster Center $i: $clusterCenter") - } + println("Cluster Centers: ") + val centers = model.clusterCenters + centers.foreach(println) // $example off$ spark.stop() From 8cd45d33281f1435ddef5fef00c0b10e349b76dd Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Wed, 11 May 2016 14:58:08 +0800 Subject: [PATCH 4/4] rename one var --- .../spark/examples/ml/JavaBisectingKMeansExample.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java index 41fbd1b197d37..62871448e36f5 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBisectingKMeansExample.java @@ -56,9 +56,9 @@ public static void main(String[] args) { // Shows the result. System.out.println("Cluster Centers: "); - Vector[] clusterCenters = model.clusterCenters(); - for (Vector clusterCenter : clusterCenters) { - System.out.println(clusterCenter); + Vector[] centers = model.clusterCenters(); + for (Vector center : centers) { + System.out.println(center); } // $example off$