Commit 3d24281

Backport sampling fixes from dev (suggested by Henry Milner)

1 parent 915ab97 commit 3d24281

File tree: 5 files changed, +54 -45 lines

core/src/main/scala/spark/Partitioner.scala
Lines changed: 1 addition & 1 deletion

@@ -41,7 +41,7 @@ class RangePartitioner[K <% Ordered[K]: ClassManifest, V](
       Array()
     } else {
       val rddSize = rdd.count()
-      val maxSampleSize = partitions * 10.0
+      val maxSampleSize = partitions * 20.0
       val frac = math.min(maxSampleSize / math.max(rddSize, 1), 1.0)
       val rddSample = rdd.sample(true, frac, 1).map(_._1).collect().sortWith(_ < _)
       if (rddSample.length == 0) {
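
The one functional change above doubles the number of keys RangePartitioner samples per partition (from ~10 to ~20) when estimating range bounds, while still capping the fraction at 1.0 for small RDDs. A minimal standalone sketch of the fraction math (the helper name sampleFraction is ours, not Spark's):

    // Aim for ~20 sampled keys per partition, but never sample more than
    // the whole RDD (fraction 1.0); math.max guards against rddSize == 0.
    def sampleFraction(partitions: Int, rddSize: Long): Double = {
      val maxSampleSize = partitions * 20.0
      math.min(maxSampleSize / math.max(rddSize, 1), 1.0)
    }

    // e.g. sampleFraction(8, 1000000L) == 1.6e-4, about 160 sampled keys.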

core/src/main/scala/spark/RDD.scala
Lines changed: 14 additions & 15 deletions

@@ -97,32 +97,31 @@ abstract class RDD[T: ClassManifest](@transient sc: SparkContext) extends Serial
     var multiplier = 3.0
     var initialCount = count()
     var maxSelected = 0
-
-    if (initialCount > Integer.MAX_VALUE) {
-      maxSelected = Integer.MAX_VALUE
+
+    if (initialCount > Integer.MAX_VALUE - 1) {
+      maxSelected = Integer.MAX_VALUE - 1
     } else {
       maxSelected = initialCount.toInt
     }
-
+
     if (num > initialCount) {
       total = maxSelected
-      fraction = Math.min(multiplier * (maxSelected + 1) / initialCount, 1.0)
+      fraction = math.min(multiplier * (maxSelected + 1) / initialCount, 1.0)
     } else if (num < 0) {
       throw(new IllegalArgumentException("Negative number of elements requested"))
     } else {
-      fraction = Math.min(multiplier * (num + 1) / initialCount, 1.0)
-      total = num.toInt
+      fraction = math.min(multiplier * (num + 1) / initialCount, 1.0)
+      total = num
     }
-
-    var samples = this.sample(withReplacement, fraction, seed).collect()
-
+
+    val rand = new Random(seed)
+    var samples = this.sample(withReplacement, fraction, rand.nextInt).collect()
+
     while (samples.length < total) {
-      samples = this.sample(withReplacement, fraction, seed).collect()
+      samples = this.sample(withReplacement, fraction, rand.nextInt).collect()
     }
-
-    val arr = samples.take(total)
-
-    return arr
+
+    Utils.randomizeInPlace(samples, rand).take(total)
   }

   def union(other: RDD[T]): RDD[T] = new UnionRDD(sc, Array(this, other))
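
Three things change in takeSample: maxSelected is capped at Integer.MAX_VALUE - 1 so that maxSelected + 1 cannot overflow Int; each retry draws a fresh seed from one locally seeded Random (previously the loop reused the same seed, so an undersized sample was redrawn identically forever); and the oversampled result is shuffled before taking, so the kept elements are not biased toward early partitions. A self-contained sketch of that pattern over a plain collection; TakeSampleSketch and attempt are our names, and scala.util.Random.shuffle stands in for Utils.randomizeInPlace:

    import java.util.Random

    object TakeSampleSketch {
      def takeSample[T](data: IndexedSeq[T], num: Int, seed: Int): IndexedSeq[T] = {
        require(num >= 0, "Negative number of elements requested")
        val rand = new Random(seed)            // seeded once, reused below
        val total = math.min(num, data.length) // can't return more than we have
        val fraction = math.min(3.0 * (num + 1) / math.max(data.length, 1), 1.0)

        // Bernoulli-sample each element with probability `fraction`, reseeding
        // from `rand` on every attempt so retries draw different samples.
        def attempt(): IndexedSeq[T] = {
          val r = new Random(rand.nextInt)
          data.filter(_ => r.nextDouble <= fraction)
        }

        var samples = attempt()
        while (samples.length < total) { // undershot: resample with a new seed
          samples = attempt()
        }
        // Shuffle before taking, so we don't keep only the earliest elements.
        new scala.util.Random(rand.nextInt).shuffle(samples).take(total)
      }
    }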
core/src/main/scala/spark/SampledRDD.scala
Lines changed: 16 additions & 12 deletions

@@ -1,9 +1,11 @@
 package spark

 import java.util.Random
+import cern.jet.random.Poisson
+import cern.jet.random.engine.DRand

 class SampledRDDSplit(val prev: Split, val seed: Int) extends Split with Serializable {
-  override val index = prev.index
+  override val index: Int = prev.index
 }

 class SampledRDD[T: ClassManifest](
@@ -15,7 +17,7 @@ class SampledRDD[T: ClassManifest](

   @transient
   val splits_ = {
-    val rg = new Random(seed);
+    val rg = new Random(seed)
     prev.splits.map(x => new SampledRDDSplit(x, rg.nextInt))
   }

@@ -28,19 +30,21 @@ class SampledRDD[T: ClassManifest](

   override def compute(splitIn: Split) = {
     val split = splitIn.asInstanceOf[SampledRDDSplit]
-    val rg = new Random(split.seed);
-    // Sampling with replacement (TODO: use reservoir sampling to make this more efficient?)
     if (withReplacement) {
-      val oldData = prev.iterator(split.prev).toArray
-      val sampleSize = (oldData.size * frac).ceil.toInt
-      val sampledData = {
-        // all of oldData's indices are candidates, even if sampleSize < oldData.size
-        for (i <- 1 to sampleSize)
-          yield oldData(rg.nextInt(oldData.size))
+      // For large datasets, the expected number of occurrences of each element in a sample with
+      // replacement is Poisson(frac). We use that to get a count for each element.
+      val poisson = new Poisson(frac, new DRand(split.seed))
+      prev.iterator(split.prev).flatMap { element =>
+        val count = poisson.nextInt()
+        if (count == 0) {
+          Iterator.empty // Avoid object allocation when we return 0 items, which is quite often
+        } else {
+          Iterator.fill(count)(element)
+        }
       }
-      sampledData.iterator
     } else { // Sampling without replacement
-      prev.iterator(split.prev).filter(x => (rg.nextDouble <= frac))
+      val rand = new Random(split.seed)
+      prev.iterator(split.prev).filter(x => (rand.nextDouble <= frac))
     }
   }
 }
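
The rewrite replaces "materialize the whole partition, then draw random indices" with a single streaming pass: in a with-replacement sample at rate frac, the number of times any given element appears is (for large data) approximately Poisson(frac), so the code draws a per-element count and emits the element that many times. A sketch of the same trick, with Knuth's inversion algorithm standing in for colt's cern.jet.random.Poisson so it runs without the colt dependency (PoissonSampleSketch and poissonDraw are our names):

    import java.util.Random

    object PoissonSampleSketch {
      // Knuth's inversion method: multiply uniforms until the product drops
      // below e^(-mean). Fine for small means like a sampling fraction.
      def poissonDraw(mean: Double, rand: Random): Int = {
        val limit = math.exp(-mean)
        var k = 0
        var p = 1.0
        do {
          k += 1
          p *= rand.nextDouble()
        } while (p > limit)
        k - 1
      }

      // One streaming pass: emit each element count ~ Poisson(frac) times.
      def sampleWithReplacement[T](it: Iterator[T], frac: Double, seed: Int): Iterator[T] = {
        val rand = new Random(seed)
        it.flatMap { element =>
          val count = poissonDraw(frac, rand)
          if (count == 0) Iterator.empty else Iterator.fill(count)(element)
        }
      }
    }

The payoff is memory: the old path called prev.iterator(split.prev).toArray, buffering the entire partition, while the new one never holds more than one element at a time.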

core/src/main/scala/spark/Utils.scala
Lines changed: 20 additions & 14 deletions

@@ -5,8 +5,7 @@ import java.net.InetAddress
 import java.util.concurrent.{Executors, ThreadFactory, ThreadPoolExecutor}

 import scala.collection.mutable.ArrayBuffer
-import scala.util.Random
-import java.util.{Locale, UUID}
+import java.util.{Locale, UUID, Random}

 /**
  * Various utility methods used by Spark.
@@ -104,20 +103,27 @@ object Utils {
     }
   }

-  // Shuffle the elements of a collection into a random order, returning the
-  // result in a new collection. Unlike scala.util.Random.shuffle, this method
-  // uses a local random number generator, avoiding inter-thread contention.
-  def randomize[T](seq: TraversableOnce[T]): Seq[T] = {
-    val buf = new ArrayBuffer[T]()
-    buf ++= seq
-    val rand = new Random()
-    for (i <- (buf.size - 1) to 1 by -1) {
+  /**
+   * Shuffle the elements of a collection into a random order, returning the
+   * result in a new collection. Unlike scala.util.Random.shuffle, this method
+   * uses a local random number generator, avoiding inter-thread contention.
+   */
+  def randomize[T: ClassManifest](seq: TraversableOnce[T]): Seq[T] = {
+    randomizeInPlace(seq.toArray)
+  }
+
+  /**
+   * Shuffle the elements of an array into a random order, modifying the
+   * original array. Returns the original array.
+   */
+  def randomizeInPlace[T](arr: Array[T], rand: Random = new Random): Array[T] = {
+    for (i <- (arr.length - 1) to 1 by -1) {
       val j = rand.nextInt(i)
-      val tmp = buf(j)
-      buf(j) = buf(i)
-      buf(i) = tmp
+      val tmp = arr(j)
+      arr(j) = arr(i)
+      arr(i) = tmp
     }
-    buf
+    arr
   }

   /**
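
randomize now delegates to the new randomizeInPlace, which takes the caller's Random as an optional parameter (defaulting to a fresh one); that parameter is exactly what lets takeSample above pass in its own seeded generator. A quick hypothetical usage, assuming this Utils object is on the classpath:

    // Copying shuffle: builds an Array from the sequence and shuffles it.
    val shuffledCopy: Seq[Int] = Utils.randomize(1 to 10)

    // In-place shuffle with a caller-supplied seeded generator (reproducible).
    val arr = Array("a", "b", "c", "d")
    val rand = new java.util.Random(42)
    Utils.randomizeInPlace(arr, rand) // returns the same, now reordered, array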

project/SparkBuild.scala
Lines changed: 3 additions & 3 deletions

@@ -58,7 +58,8 @@ object SparkBuild extends Build {
       "com.google.protobuf" % "protobuf-java" % "2.4.1",
       "de.javakaffee" % "kryo-serializers" % "0.9",
       "org.jboss.netty" % "netty" % "3.2.6.Final",
-      "it.unimi.dsi" % "fastutil" % "6.4.2"
+      "it.unimi.dsi" % "fastutil" % "6.4.2",
+      "colt" % "colt" % "1.2.0"
     )
   ) ++ assemblySettings ++ Seq(test in assembly := {})

@@ -68,8 +69,7 @@ object SparkBuild extends Build {
   ) ++ assemblySettings ++ Seq(test in assembly := {})

   def examplesSettings = sharedSettings ++ Seq(
-    name := "spark-examples",
-    libraryDependencies += "colt" % "colt" % "1.2.0"
+    name := "spark-examples"
   )

   def bagelSettings = sharedSettings ++ Seq(name := "spark-bagel")
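
Since SampledRDD now imports cern.jet.random.Poisson, the colt artifact must sit on core's classpath rather than only the examples project's, hence the move above. Expressed as a single sbt setting (same coordinates the diff adds to core):

    libraryDependencies += "colt" % "colt" % "1.2.0"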
