Commit a5212a5

Author: xinyunh

Add support to UDF and modify the test case

1 parent: 1cb181e

File tree: 4 files changed, +36 / -20 lines

src/main/scala/org/apache/spark/sql/hbase/HBaseCustomFilter.scala
Lines changed: 12 additions & 7 deletions
@@ -166,7 +166,7 @@ private[hbase] class HBaseCustomFilter extends FilterBase with Writable {
    * @param node the node to reset children on
    * @return
    */
-  def resetNode(node: Node) = {
+  private def resetNode(node: Node) = {
     if (node != null && node.cpr != null) {
       node.currentValue = node.cpr.start.orNull
       if (node.currentValue != null && !node.cpr.startInclusive) {
@@ -401,7 +401,7 @@ private[hbase] class HBaseCustomFilter extends FilterBase with Writable {
    * @param node the node to start with
    * @return (return code, the row key after successful increment)
    */
-  def increment(node: Node): (ReturnCode, HBaseRawType) = {
+  private def increment(node: Node): (ReturnCode, HBaseRawType) = {
     var currentNode: Node = node
     while (currentNode.parent != null) {
       if (addOne(currentNode)) {
@@ -439,7 +439,7 @@ private[hbase] class HBaseCustomFilter extends FilterBase with Writable {
    * @param node the node to add 1 to
    * @return whether the addition can be made within the value domain
    */
-  def addOne(node: Node): Boolean = {
+  private def addOne(node: Node): Boolean = {
     val dt = node.dt
     val value = node.currentValue
     var canAddOne: Boolean = true
@@ -569,7 +569,7 @@ private[hbase] class HBaseCustomFilter extends FilterBase with Writable {
    * do a full evaluation for the remaining predicate based on all the cell values
    * @param kvs the list of cells
    */
-  def fullEvalution(kvs: java.util.List[Cell]) = {
+  private def fullEvalution(kvs: java.util.List[Cell]) = {
     resetRow(workingRow)
     cellMap.clear()
     for (i <- 0 to kvs.size() - 1) {
@@ -609,9 +609,14 @@ private[hbase] class HBaseCustomFilter extends FilterBase with Writable {
   }

   override def filterRowCells(kvs: java.util.List[Cell]) = {
-    if (remainingPredicate != null) {
-      fullEvalution(kvs)
-    }
+    // In the coprocessor, if the call to filterKeyValue returns INCLUDE on the very last
+    // record, the scanner runs past the end and never calls filterKeyValue() before reaching
+    // here, leading to an empty kvs and a subsequent NPE. This is observed with HBase 0.98.5.
+    //
+    // If a later HBase release has this addressed, this check becomes unnecessary and can
+    // be removed to save some CPU cycles.
+    if (kvs.isEmpty) filterRowFlag = true
+    else if (remainingPredicate != null) fullEvalution(kvs)
   }

   override def hasFilterRow: Boolean = {
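
For readers unfamiliar with the HBase filter lifecycle, the guard introduced above can be illustrated in isolation. A minimal sketch, assuming the HBase 0.98-era FilterBase callbacks; the class name and the evaluate helper are illustrative stand-ins, not the commit's actual code (only filterRowFlag mirrors a field the diff references):

```scala
import java.util.{List => JList}
import org.apache.hadoop.hbase.Cell
import org.apache.hadoop.hbase.filter.FilterBase

// Illustrative sketch: a row filter that tolerates the empty cell list
// HBase 0.98.5 may hand to filterRowCells on the region's last row.
class DefensiveRowFilter extends FilterBase {
  // set when the current row should be excluded by filterRow()
  private var filterRowFlag: Boolean = false

  override def filterRowCells(kvs: JList[Cell]): Unit = {
    if (kvs.isEmpty) filterRowFlag = true  // nothing to evaluate: drop the row
    else filterRowFlag = !evaluate(kvs)    // keep the row only if the predicate holds
  }

  // tells the scanner that filterRow() must be consulted for each row
  override def hasFilterRow: Boolean = true

  // returning true excludes the whole row
  override def filterRow(): Boolean = filterRowFlag

  // stand-in for evaluating the remaining predicate over the row's cells
  private def evaluate(kvs: JList[Cell]): Boolean = true
}
```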

src/main/scala/org/apache/spark/sql/hbase/HBaseSQLReaderRDD.scala
Lines changed: 9 additions & 2 deletions
@@ -221,14 +221,21 @@ class HBaseSQLReaderRDD(val relation: HBaseRelation,
       s
     }

+    if (!useCustomFilter) {
+      def addOtherFilter(rdd: RDD[Row]): Unit = rdd match {
+        case hcsRDD: HBaseCoprocessorSQLReaderRDD => hcsRDD.otherFilters = otherFilters
+        case _ => if (rdd.dependencies.nonEmpty) addOtherFilter(rdd.firstParent[Row])
+      }
+      addOtherFilter(newSubplanRDD)
+    }
+
     val outputDataType: Seq[DataType] = subplan.get.output.map(attr => attr.dataType)
     val taskContextPara: (Int, Int, Long, Int) = TaskContext.get() match {
       case t: TaskContextImpl => (t.stageId, t.partitionId, t.taskAttemptId, t.attemptNumber)
       case _ => (0, 0, 0L, 0)
     }

-    scan.setAttribute(CoprocessorConstants.COINDEX,
-      Bytes.toBytes(partitionIndex))
+    scan.setAttribute(CoprocessorConstants.COINDEX, Bytes.toBytes(partitionIndex))
     scan.setAttribute(CoprocessorConstants.COTYPE, HBaseSerializer.serialize(outputDataType))
     scan.setAttribute(CoprocessorConstants.COKEY, HBaseSerializer.serialize(newSubplanRDD))
     scan.setAttribute(CoprocessorConstants.COTASK, HBaseSerializer.serialize(taskContextPara))
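
The addOtherFilter helper above walks the RDD lineage to hand otherFilters down to the coprocessor RDD, wherever it sits in the parent chain. A standalone sketch of the same pattern, with a hypothetical Configurable trait standing in for HBaseCoprocessorSQLReaderRDD, and dependencies.head.rdd in place of firstParent (which is protected[spark] outside Spark's own packages):

```scala
import org.apache.spark.rdd.RDD

object LineageWalk {
  // Hypothetical stand-in for the coprocessor RDD type being searched for.
  trait Configurable { def configure(): Unit }

  // Recurse down the first-parent chain until an RDD of the target type is
  // found; stop quietly at a leaf with no parents.
  def walkLineage(rdd: RDD[_]): Unit = rdd match {
    case c: Configurable => c.configure()
    case _ if rdd.dependencies.nonEmpty => walkLineage(rdd.dependencies.head.rdd)
    case _ => // leaf reached without a match; nothing to configure
  }
}
```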

src/main/scala/org/apache/spark/sql/hbase/SparkSqlRegionObserver.scala
Lines changed: 3 additions & 7 deletions
@@ -47,14 +47,10 @@ class HBaseCoprocessorSQLReaderRDD(var relation: HBaseRelation,
   private def createIterator(context: TaskContext): Iterator[Row] = {
     val otherFilter: (Row) => Boolean = {
       if (otherFilters.isDefined) {
-        if (relation.deploySuccessfully.isDefined && relation.deploySuccessfully.get) {
-          null
+        if (codegenEnabled) {
+          GeneratePredicate.generate(otherFilters.get, finalOutput)
         } else {
-          if (codegenEnabled) {
-            GeneratePredicate.generate(otherFilters.get, finalOutput)
-          } else {
-            InterpretedPredicate.create(otherFilters.get, finalOutput)
-          }
+          InterpretedPredicate.create(otherFilters.get, finalOutput)
         }
       } else null
     }
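
After this change, predicate construction depends only on the codegen flag rather than on deployment state. The shape of that choice, factored out with the same Catalyst internals the file already uses (GeneratePredicate.generate and InterpretedPredicate.create are Spark 1.x internal APIs, so the exact imports and signatures are version-dependent assumptions):

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, InterpretedPredicate}
import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate

object PredicateChoice {
  // Build the row predicate once per partition: compiled via codegen when
  // enabled, otherwise the interpreted evaluator. Same semantics, different
  // evaluation strategy.
  def buildPredicate(codegenEnabled: Boolean,
                     expr: Expression,
                     output: Seq[Attribute]): Row => Boolean =
    if (codegenEnabled) GeneratePredicate.generate(expr, output)
    else InterpretedPredicate.create(expr, output)
}
```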

src/test/scala/org/apache/spark/sql/hbase/HBaseAdditionalQuerySuite.scala
Lines changed: 12 additions & 4 deletions
@@ -179,14 +179,22 @@ class HBaseAdditionalQuerySuite extends TestBase {

   test("DataFrame Test") {
     val teachers: DataFrame = TestHbase.sql("Select * from spark_teacher_3key")
-    teachers.orderBy(Column("grade").asc, Column("class").asc).show(3)
+    val result = teachers.orderBy(Column("grade").asc, Column("class").asc)
+      .select("teacher_name").limit(3).collect()
+    result.foreach(println)
+    val exparr = Array(Array("teacher_1_1_1"), Array("teacher_1_2_1"), Array("teacher_1_3_1"))
+    val res = {
+      for (rx <- exparr.indices)
+        yield compareWithTol(result(rx).toSeq, exparr(rx), s"Row$rx failed")
+    }.foldLeft(true) { case (res1, newres) => res1 && newres }
+    assert(res, "One or more rows did not match expected")
   }

   test("UDF Test") {
-    def myFilter(date: String) = date contains "_1_2"
+    def myFilter(s: String) = s contains "_1_2"
     TestHbase.udf.register("myFilter", myFilter _)
-    val result = TestHbase.sql("Select * from spark_teacher_3key WHERE myFilter(teacher_name)")
-    result.foreach(println)
+    val result = TestHbase.sql("Select count(*) from spark_teacher_3key WHERE myFilter(teacher_name)")
+    result.foreach(r => require(r.getLong(0) == 3L))
   }

   test("group test for presplit table with coprocessor but without codegen") {
