[SPARK-6518][MLLIB][EXAMPLE][DOC] Add example code and user guide for bisecting k-means

yu-iskw · jkbradley · commit e1adf6d7d1c7 · 2015-12-16T10:55:54.000-08:00
This PR includes only an example code in order to finish it quickly. I'll send another PR for the docs soon. Author: Yu ISHIKAWA <yuu.ishikawa@gmail.com> Closes apache#9952 from yu-iskw/SPARK-6518. (cherry picked from commit 7b6dc29) Signed-off-by: Joseph K. Bradley <joseph@databricks.com>
diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md
@@ -718,6 +718,41 @@ sameModel = LDAModel.load(sc, "myModelPath")
 
 </div>
 
+## Bisecting k-means
+
+Bisecting k-means can often be much faster than regular k-means, but it will generally produce a different clustering.
+
+Bisecting k-means is a kind of [hierarchical clustering](https://en.wikipedia.org/wiki/Hierarchical_clustering).
+Hierarchical clustering is one of the most commonly used methods of cluster analysis; it seeks to build a hierarchy of clusters.
+Strategies for hierarchical clustering generally fall into two types:
+
+- Agglomerative: This is a "bottom up" approach: each observation starts in its own cluster, and pairs of clusters are merged as one moves up the hierarchy.
+- Divisive: This is a "top down" approach: all observations start in one cluster, and splits are performed recursively as one moves down the hierarchy.
+
+The bisecting k-means algorithm is a divisive algorithm.
+The implementation in MLlib has the following parameters:
+
+* *k*: the desired number of leaf clusters (default: 4). The actual number could be smaller if there are no divisible leaf clusters.
+* *maxIterations*: the max number of k-means iterations to split clusters (default: 20)
+* *minDivisibleClusterSize*: the minimum number of points (if >= 1.0) or the minimum proportion of points (if < 1.0) of a divisible cluster (default: 1)
+* *seed*: a random seed (default: hash value of the class name)
+
+**Examples**
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+Refer to the [`BisectingKMeans` Scala docs](api/scala/index.html#org.apache.spark.mllib.clustering.BisectingKMeans) and [`BisectingKMeansModel` Scala docs](api/scala/index.html#org.apache.spark.mllib.clustering.BisectingKMeansModel) for details on the API.
+
+{% include_example scala/org/apache/spark/examples/mllib/BisectingKMeansExample.scala %}
+</div>
+
+<div data-lang="java" markdown="1">
+Refer to the [`BisectingKMeans` Java docs](api/java/org/apache/spark/mllib/clustering/BisectingKMeans.html) and [`BisectingKMeansModel` Java docs](api/java/org/apache/spark/mllib/clustering/BisectingKMeansModel.html) for details on the API.
+
+{% include_example java/org/apache/spark/examples/mllib/JavaBisectingKMeansExample.java %}
+</div>
+</div>
+
 ## Streaming k-means
 
 When data arrive in a stream, we may want to estimate clusters dynamically,
diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md
@@ -49,6 +49,7 @@ We list major functionality from both below, with links to detailed guides.
   * [Gaussian mixture](mllib-clustering.html#gaussian-mixture)
   * [power iteration clustering (PIC)](mllib-clustering.html#power-iteration-clustering-pic)
   * [latent Dirichlet allocation (LDA)](mllib-clustering.html#latent-dirichlet-allocation-lda)
+  * [bisecting k-means](mllib-clustering.html#bisecting-k-means)
   * [streaming k-means](mllib-clustering.html#streaming-k-means)
 * [Dimensionality reduction](mllib-dimensionality-reduction.html)
   * [singular value decomposition (SVD)](mllib-dimensionality-reduction.html#singular-value-decomposition-svd)
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaBisectingKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaBisectingKMeansExample.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+import java.util.ArrayList;
+
+// $example on$
+import com.google.common.collect.Lists;
+// $example off$
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.mllib.clustering.BisectingKMeans;
+import org.apache.spark.mllib.clustering.BisectingKMeansModel;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.Vectors;
+// $example off$
+
+/**
+ * Java example for bisecting k-means clustering.
+ */
+public class JavaBisectingKMeansExample {
+  public static void main(String[] args) {
+    SparkConf sparkConf = new SparkConf().setAppName("JavaBisectingKMeansExample");
+    JavaSparkContext sc = new JavaSparkContext(sparkConf);
+
+    // $example on$
+    // Build a small local data set of 2-D points and parallelize it into 2 slices.
+    ArrayList<Vector> localData = Lists.newArrayList(
+      Vectors.dense(0.1, 0.1),   Vectors.dense(0.3, 0.3),
+      Vectors.dense(10.1, 10.1), Vectors.dense(10.3, 10.3),
+      Vectors.dense(20.1, 20.1), Vectors.dense(20.3, 20.3),
+      Vectors.dense(30.1, 30.1), Vectors.dense(30.3, 30.3)
+    );
+    JavaRDD<Vector> data = sc.parallelize(localData, 2);
+
+    // Cluster the data with BisectingKMeans, asking for 4 leaf clusters.
+    BisectingKMeans bkm = new BisectingKMeans()
+      .setK(4);
+    BisectingKMeansModel model = bkm.run(data);
+
+    // Show the compute cost and the cluster centers.
+    System.out.println("Compute Cost: " + model.computeCost(data));
+    Vector[] clusterCenters = model.clusterCenters();
+    for (int i = 0; i < clusterCenters.length; i++) {
+      Vector clusterCenter = clusterCenters[i];
+      System.out.println("Cluster Center " + i + ": " + clusterCenter);
+    }
+    // $example off$
+
+    sc.stop();
+  }
+}
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BisectingKMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BisectingKMeansExample.scala
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib
+
+// scalastyle:off println
+// $example on$
+import org.apache.spark.mllib.clustering.BisectingKMeans
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
+// $example off$
+import org.apache.spark.{SparkConf, SparkContext}
+
+/**
+ * Demonstrates bisecting k-means clustering with the spark.mllib API.
+ *
+ * To run it:
+ * {{{
+ * bin/run-example mllib.BisectingKMeansExample
+ * }}}
+ */
+object BisectingKMeansExample {
+
+  def main(args: Array[String]) {
+    val conf = new SparkConf().setAppName("mllib.BisectingKMeansExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    // Read the input file and turn each space-separated line into a dense vector
+    val toVector: String => Vector = line => Vectors.dense(line.split(" ").map(_.toDouble))
+    val points = sc.textFile("data/mllib/kmeans_data.txt").map(toVector).cache()
+
+    // Run BisectingKMeans, asking for 6 leaf clusters
+    val bisectingKMeans = new BisectingKMeans().setK(6)
+    val model = bisectingKMeans.run(points)
+
+    // Print the compute cost, then every cluster center with its index
+    println(s"Compute Cost: ${model.computeCost(points)}")
+    for ((center, idx) <- model.clusterCenters.zipWithIndex) {
+      println(s"Cluster Center ${idx}: ${center}")
+    }
+    // $example off$
+
+    sc.stop()
+  }
+}
+// scalastyle:on println