Skip to content

Commit d77bf0b

Browse files
JoshRosenrxin
authored andcommitted
[SPARK-12075][SQL] Speed up HiveComparisionTest by avoiding / speeding up TestHive.reset()
When profiling HiveCompatibilitySuite, I noticed that most of the time seems to be spent in expensive `TestHive.reset()` calls. This patch speeds up suites based on HiveComparisionTest, such as HiveCompatibilitySuite, with the following changes: - Avoid `TestHive.reset()` whenever possible: - Use a simple set of heuristics to guess whether we need to call `reset()` in between tests. - As a safety-net, automatically re-run failed tests by calling `reset()` before the re-attempt. - Speed up the expensive parts of `TestHive.reset()`: loading the `src` and `srcpart` tables took roughly 600ms per test, so we now avoid this by using a simple heuristic which only loads those tables by tests that reference them. This is based on simple string matching over the test queries which errs on the side of loading in more situations than might be strictly necessary. After these changes, HiveCompatibilitySuite seems to run in about 10 minutes. This PR is a revival of apache#6663, an earlier experimental PR from June, where I played around with several possible speedups for this suite. Author: Josh Rosen <joshrosen@databricks.com> Closes apache#10055 from JoshRosen/speculative-testhive-reset. (cherry picked from commit ef6790f) Signed-off-by: Reynold Xin <rxin@databricks.com>
1 parent 012de2c commit d77bf0b

File tree

3 files changed

+62
-14
lines changed

3 files changed

+62
-14
lines changed

sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -443,13 +443,6 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) {
443443
defaultOverrides()
444444

445445
runSqlHive("USE default")
446-
447-
// Just loading src makes a lot of tests pass. This is because some tests do something like
448-
// drop an index on src at the beginning. Since we just pass DDL to hive this bypasses our
449-
// Analyzer and thus the test table auto-loading mechanism.
450-
// Remove after we handle more DDL operations natively.
451-
loadTestTable("src")
452-
loadTestTable("srcpart")
453446
} catch {
454447
case e: Exception =>
455448
logError("FATAL ERROR: Failed to reset TestDB state.", e)

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala

Lines changed: 61 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,11 @@ abstract class HiveComparisonTest
209209
}
210210

211211
val installHooksCommand = "(?i)SET.*hooks".r
212-
def createQueryTest(testCaseName: String, sql: String, reset: Boolean = true) {
212+
def createQueryTest(
213+
testCaseName: String,
214+
sql: String,
215+
reset: Boolean = true,
216+
tryWithoutResettingFirst: Boolean = false) {
213217
// testCaseName must not contain ':', which is not allowed to appear in a filename of Windows
214218
assert(!testCaseName.contains(":"))
215219

@@ -240,9 +244,6 @@ abstract class HiveComparisonTest
240244
test(testCaseName) {
241245
logDebug(s"=== HIVE TEST: $testCaseName ===")
242246

243-
// Clear old output for this testcase.
244-
outputDirectories.map(new File(_, testCaseName)).filter(_.exists()).foreach(_.delete())
245-
246247
val sqlWithoutComment =
247248
sql.split("\n").filterNot(l => l.matches("--.*(?<=[^\\\\]);")).mkString("\n")
248249
val allQueries =
@@ -269,11 +270,32 @@ abstract class HiveComparisonTest
269270
}.mkString("\n== Console version of this test ==\n", "\n", "\n")
270271
}
271272

272-
try {
273+
def doTest(reset: Boolean, isSpeculative: Boolean = false): Unit = {
274+
// Clear old output for this testcase.
275+
outputDirectories.map(new File(_, testCaseName)).filter(_.exists()).foreach(_.delete())
276+
273277
if (reset) {
274278
TestHive.reset()
275279
}
276280

281+
// Many tests drop indexes on src and srcpart at the beginning, so we need to load those
282+
// tables here. Since DROP INDEX DDL is just passed to Hive, it bypasses the analyzer and
283+
// thus the tables referenced in those DDL commands cannot be extracted for use by our
284+
// test table auto-loading mechanism. In addition, the tests which use the SHOW TABLES
285+
// command expect these tables to exist.
286+
val hasShowTableCommand = queryList.exists(_.toLowerCase.contains("show tables"))
287+
for (table <- Seq("src", "srcpart")) {
288+
val hasMatchingQuery = queryList.exists { query =>
289+
val normalizedQuery = query.toLowerCase.stripSuffix(";")
290+
normalizedQuery.endsWith(table) ||
291+
normalizedQuery.contains(s"from $table") ||
292+
normalizedQuery.contains(s"from default.$table")
293+
}
294+
if (hasShowTableCommand || hasMatchingQuery) {
295+
TestHive.loadTestTable(table)
296+
}
297+
}
298+
277299
val hiveCacheFiles = queryList.zipWithIndex.map {
278300
case (queryString, i) =>
279301
val cachedAnswerName = s"$testCaseName-$i-${getMd5(queryString)}"
@@ -430,12 +452,45 @@ abstract class HiveComparisonTest
430452
""".stripMargin
431453

432454
stringToFile(new File(wrongDirectory, testCaseName), errorMessage + consoleTestCase)
433-
fail(errorMessage)
455+
if (isSpeculative && !reset) {
456+
fail("Failed on first run; retrying")
457+
} else {
458+
fail(errorMessage)
459+
}
434460
}
435461
}
436462

437463
// Touch passed file.
438464
new FileOutputStream(new File(passedDirectory, testCaseName)).close()
465+
}
466+
467+
val canSpeculativelyTryWithoutReset: Boolean = {
468+
val excludedSubstrings = Seq(
469+
"into table",
470+
"create table",
471+
"drop index"
472+
)
473+
!queryList.map(_.toLowerCase).exists { query =>
474+
excludedSubstrings.exists(s => query.contains(s))
475+
}
476+
}
477+
478+
try {
479+
try {
480+
if (tryWithoutResettingFirst && canSpeculativelyTryWithoutReset) {
481+
doTest(reset = false, isSpeculative = true)
482+
} else {
483+
doTest(reset)
484+
}
485+
} catch {
486+
case tf: org.scalatest.exceptions.TestFailedException =>
487+
if (tryWithoutResettingFirst && canSpeculativelyTryWithoutReset) {
488+
logWarning("Test failed without reset(); retrying with reset()")
489+
doTest(reset = true)
490+
} else {
491+
throw tf
492+
}
493+
}
439494
} catch {
440495
case tf: org.scalatest.exceptions.TestFailedException => throw tf
441496
case originalException: Exception =>

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQueryFileTest.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ abstract class HiveQueryFileTest extends HiveComparisonTest {
5959
runAll) {
6060
// Build a test case and submit it to scala test framework...
6161
val queriesString = fileToString(testCaseFile)
62-
createQueryTest(testCaseName, queriesString)
62+
createQueryTest(testCaseName, queriesString, reset = true, tryWithoutResettingFirst = true)
6363
} else {
6464
// Only output warnings for the built in whitelist as this clutters the output when the user
6565
// trying to execute a single test from the commandline.

0 commit comments

Comments
 (0)