@@ -39,27 +39,23 @@ import org.apache.spark.{SparkConf, SparkContext}
39
39
* n: Number of sampled points on innermost circle. There are proportionally more points
40
40
* within the outer/larger circles
41
41
* maxIterations: Number of Power Iterations
42
- * outerRadius: radius of the outermost of the concentric circles
43
42
* }}}
44
43
*
45
44
* Here is a sample run and output:
46
45
*
47
- * ./bin/run-example mllib.PowerIterationClusteringExample -k 3 --n 30 --maxIterations 15
48
- *
49
- * Cluster assignments: 1 -> [0,1,2,3,4],2 -> [5,6,7,8,9,10,11,12,13,14],
50
- * 0 -> [15,16,17,18,19,20,21,22,23,24,25,26,27,28,29]
46
+ * ./bin/run-example mllib.PowerIterationClusteringExample -k 2 --n 10 --maxIterations 15
51
47
*
48
+ * Cluster assignments: 1 -> [0,1,2,3,4,5,6,7,8,9],
49
+ * 0 -> [10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29]
52
50
*
53
51
* If you use it as a template to create your own app, please use `spark-submit` to submit your app.
54
52
*/
55
53
object PowerIterationClusteringExample {
56
54
57
55
case class Params (
58
- input : String = null ,
59
- k : Int = 3 ,
60
- numPoints : Int = 5 ,
61
- maxIterations : Int = 10 ,
62
- outerRadius : Double = 3.0
56
+ k : Int = 2 ,
57
+ numPoints : Int = 10 ,
58
+ maxIterations : Int = 15
63
59
) extends AbstractParams [Params ]
64
60
65
61
def main (args : Array [String ]) {
@@ -68,17 +64,14 @@ object PowerIterationClusteringExample {
68
64
val parser = new OptionParser [Params ](" PowerIterationClusteringExample" ) {
69
65
head(" PowerIterationClusteringExample: an example PIC app using concentric circles." )
70
66
opt[Int ]('k' , " k" )
71
- .text(s " number of circles (/ clusters), default: ${defaultParams.k}" )
67
+ .text(s " number of circles (clusters), default: ${defaultParams.k}" )
72
68
.action((x, c) => c.copy(k = x))
73
69
opt[Int ]('n' , " n" )
74
70
.text(s " number of points in smallest circle, default: ${defaultParams.numPoints}" )
75
71
.action((x, c) => c.copy(numPoints = x))
76
72
opt[Int ](" maxIterations" )
77
73
.text(s " number of iterations, default: ${defaultParams.maxIterations}" )
78
74
.action((x, c) => c.copy(maxIterations = x))
79
- opt[Double ]('r' , " r" )
80
- .text(s " radius of outermost circle, default: ${defaultParams.outerRadius}" )
81
- .action((x, c) => c.copy(outerRadius = x))
82
75
}
83
76
84
77
parser.parse(args, defaultParams).map { params =>
@@ -96,20 +89,21 @@ object PowerIterationClusteringExample {
96
89
97
90
Logger .getRootLogger.setLevel(Level .WARN )
98
91
99
- val circlesRdd = generateCirclesRdd(sc, params.k, params.numPoints, params.outerRadius )
92
+ val circlesRdd = generateCirclesRdd(sc, params.k, params.numPoints)
100
93
val model = new PowerIterationClustering ()
101
94
.setK(params.k)
102
95
.setMaxIterations(params.maxIterations)
96
+ .setInitializationMode(" degree" )
103
97
.run(circlesRdd)
104
98
105
99
val clusters = model.assignments.collect().groupBy(_.cluster).mapValues(_.map(_.id))
106
- val assignments = clusters.toList.sortBy { case (k, v) => v.length}
100
+ val assignments = clusters.toList.sortBy { case (k, v) => v.length }
107
101
val assignmentsStr = assignments
108
102
.map { case (k, v) =>
109
103
s " $k -> ${v.sorted.mkString(" [" , " ," , " ]" )}"
110
- }.mkString(" ," )
104
+ }.mkString(" , " )
111
105
val sizesStr = assignments.map {
112
- _._2.size
106
+ _._2.length
113
107
}.sorted.mkString(" (" , " ," , " )" )
114
108
println(s " Cluster assignments: $assignmentsStr\n cluster sizes: $sizesStr" )
115
109
@@ -123,20 +117,17 @@ object PowerIterationClusteringExample {
123
117
}
124
118
}
125
119
126
- def generateCirclesRdd (sc : SparkContext ,
127
- nCircles : Int = 3 ,
128
- nPoints : Int = 30 ,
129
- outerRadius : Double ): RDD [(Long , Long , Double )] = {
130
-
131
- val radii = Array .tabulate(nCircles) { cx => outerRadius / (nCircles - cx)}
132
- val groupSizes = Array .tabulate(nCircles) { cx => (cx + 1 ) * nPoints}
133
- val points = (0 until nCircles).flatMap { cx =>
134
- generateCircle(radii(cx), groupSizes(cx))
120
+ def generateCirclesRdd (
121
+ sc : SparkContext ,
122
+ nCircles : Int ,
123
+ nPoints : Int ): RDD [(Long , Long , Double )] = {
124
+ val points = (1 to nCircles).flatMap { i =>
125
+ generateCircle(i, i * nPoints)
135
126
}.zipWithIndex
136
127
val rdd = sc.parallelize(points)
137
128
val distancesRdd = rdd.cartesian(rdd).flatMap { case (((x0, y0), i0), ((x1, y1), i1)) =>
138
129
if (i0 < i1) {
139
- Some ((i0.toLong, i1.toLong, gaussianSimilarity((x0, y0), (x1, y1), 1.0 )))
130
+ Some ((i0.toLong, i1.toLong, gaussianSimilarity((x0, y0), (x1, y1))))
140
131
} else {
141
132
None
142
133
}
@@ -147,11 +138,9 @@ object PowerIterationClusteringExample {
147
138
/**
148
139
* Gaussian similarity between two 2-D points; see http://en.wikipedia.org/wiki/Radial_basis_function_kernel
149
140
*/
150
- def gaussianSimilarity (p1 : (Double , Double ), p2 : (Double , Double ), sigma : Double ): Double = {
151
- val coeff = 1.0 / (math.sqrt(2.0 * math.Pi ) * sigma)
152
- val expCoeff = - 1.0 / 2.0 * math.pow(sigma, 2.0 )
141
+ def gaussianSimilarity (p1 : (Double , Double ), p2 : (Double , Double )): Double = {
153
142
val ssquares = (p1._1 - p2._1) * (p1._1 - p2._1) + (p1._2 - p2._2) * (p1._2 - p2._2)
154
- coeff * math.exp(expCoeff * ssquares )
143
+ math.exp(- ssquares / 2.0 )
155
144
}
156
145
}
157
146
0 commit comments