From 5fc39ab6a7785616ddb6fa43d13368941e151496 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20G=C3=B6dderz?= Date: Thu, 21 Aug 2025 14:17:47 +0200 Subject: [PATCH 1/4] Added a regression test --- arangod/Aql/ExecutionBlockImpl.tpp | 10 +++ ...mizer-rule-async-prefetch-noncluster-fp.js | 85 +++++++++++++++---- 2 files changed, 77 insertions(+), 18 deletions(-) diff --git a/arangod/Aql/ExecutionBlockImpl.tpp b/arangod/Aql/ExecutionBlockImpl.tpp index d8eb5f50aeb1..3c73d1e6f15b 100644 --- a/arangod/Aql/ExecutionBlockImpl.tpp +++ b/arangod/Aql/ExecutionBlockImpl.tpp @@ -1143,6 +1143,16 @@ auto ExecutionBlockImpl::executeFetcher(ExecutionContext& ctx, block->getPlanNode()->id().id(), ": ", block->getPlanNode()->getTypeString(), "]")}); } + + TRI_IF_FAILURE("AsyncPrefetch::delayDestructor") { + // Delay the destruction of `task` to make it likely that the + // Query, and with it the AqlItemBlockManager, is destructed + // before the PrefetchTask. That way we can assert that the + // result, and possible SharedAqlItemBlockPtrs in it, will be + // destroyed before the AqlItemBlockManager. + using namespace std::chrono_literals; + std::this_thread::sleep_for(1ms); + } }); if (!queued) { diff --git a/tests/js/client/aql/aql-optimizer-rule-async-prefetch-noncluster-fp.js b/tests/js/client/aql/aql-optimizer-rule-async-prefetch-noncluster-fp.js index 6485adfd0ea0..441cbd1b2a18 100644 --- a/tests/js/client/aql/aql-optimizer-rule-async-prefetch-noncluster-fp.js +++ b/tests/js/client/aql/aql-optimizer-rule-async-prefetch-noncluster-fp.js @@ -1,5 +1,4 @@ -/*jshint globalstrict:false, strict:false, maxlen: 500 */ -/*global assertEqual, assertNotEqual */ +/* jshint strict:true */ // ////////////////////////////////////////////////////////////////////////////// // / DISCLAIMER @@ -25,13 +24,11 @@ // ////////////////////////////////////////////////////////////////////////////// const jsunity = require("jsunity"); -const internal = require('internal'); -const db = internal.db; +const {assertEqual, assertNotEqual} = jsunity.jsUnity.assertions; const helper = require("@arangodb/aql-helper"); -const errors = internal.errors; const assertQueryError = helper.assertQueryError; let IM = global.instanceManager; -const {aql} = require('@arangodb'); +const {aql, db, errors} = require('@arangodb'); function AsyncPrefetchFailure () { const ruleName = "async-prefetch"; @@ -53,9 +50,15 @@ function AsyncPrefetchFailure () { db._drop(cn); }, - // This failure was observed when the exectution block were not destroyed in topological order - // and the async task had outlived its execution block - testAsyncPrefetchFailure : function () { + tearDown : function () { + IM.debugClearFailAt(); + }, + + // This failure was observed when the execution blocks were not destroyed in + // topological order, and the async task had outlived its execution block. + // Regression test for https://arangodb.atlassian.net/browse/BTS-2119 + // (https://github.com/arangodb/arangodb/pull/21709). + testAsyncPrefetchRegressionBts2119 : function () { IM.debugSetFailAt("AsyncPrefetch::blocksDestroyedOutOfOrder"); // This query is forced to have async prefetch optimization const query = aql`FOR d IN ${col} FILTER d.val > 0 LET x = FAIL('failed') RETURN d`; @@ -64,23 +67,69 @@ function AsyncPrefetchFailure () { const result = db._createStatement(query).explain(); - const actualNodes = result.plan.nodes.map((n) => [n.type, n.isAsyncPrefetchEnabled === true ? true : false]); + const actualNodes = result.plan.nodes.map((n) => [n.type, n.isAsyncPrefetchEnabled === true]); assertEqual(expectedNodes, actualNodes, query); - - let expectPrefetchRule = expectedNodes.filter((n) => n[1]).length > 0; - if (expectPrefetchRule) { - assertNotEqual(-1, result.plan.rules.indexOf(ruleName)); - } else { - assertEqual(-1, result.plan.rules.indexOf(ruleName)); - } + + assertNotEqual(-1, result.plan.rules.indexOf(ruleName)); + assertQueryError( errors.ERROR_QUERY_FAIL_CALLED.code, query, query.bindParams, query.options ); + }, + + // Regression test for https://arangodb.atlassian.net/browse/BTS-2211. + // There was a race after the prefetch task has written its result, but + // before the shared_ptr to the task is destructed. If the Query is + // destructed during that time window, and with it the AqlItemBlockManager, + // the prefetch task could hold a SharedAqlItemBlockPtr in its result, that + // would then be released to the already-destroyed AqlItemBlockManager. + // In maintainer mode, the AqlItemBlockManager's destructor asserts that + // no AqlItemBlocks are still leased: This test would run into that + // assertion. + testAsyncPrefetchRegressionBts2211 : function () { + // After a prefetch task has written its result, wait a little before + // destructing the shared_ptr to the task. It's now likely to be + // destructed after the Query. The AqlItemBlockManager's destructor + // asserts that no AqlItemBlocks are still in use, particularly for a + // result in the prefetch task. + IM.debugSetFailAt("AsyncPrefetch::delayDestructor"); + const query = aql` + FOR d IN ${col} + FILTER NOOPT(d.val >= 0) + LET x = FAIL('failed') + RETURN d.val + `; + query.options = {}; + const expectedNodes = [ + ["SingletonNode", false], + ["EnumerateCollectionNode", true], + ["CalculationNode", false], + ["FilterNode", true], + ["CalculationNode", false], + ["ReturnNode", false], + ]; + + const result = db._createStatement(query).explain(); - IM.debugRemoveFailAt("AsyncPrefetch::blocksDestroyedOutOfOrder"); + const actualNodes = result.plan.nodes.map((n) => + [ n.type, + n.isAsyncPrefetchEnabled === true, + ]); + assertEqual(expectedNodes, actualNodes, query); + + assertNotEqual(-1, result.plan.rules.indexOf(ruleName)); + + for (let i = 0; i < 10000; ++i) { + assertQueryError( + errors.ERROR_QUERY_FAIL_CALLED.code, + query, + query.bindParams, + query.options + ); + } }, }; } From 0ba7fbd2f91a39315116caa9167fee3f39a93a5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20G=C3=B6dderz?= Date: Thu, 21 Aug 2025 14:20:44 +0200 Subject: [PATCH 2/4] When stopping prefetch tasks, always discard the results as well --- arangod/Aql/ExecutionBlockImpl.h | 8 ++++---- arangod/Aql/ExecutionBlockImpl.tpp | 14 +++++++++----- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/arangod/Aql/ExecutionBlockImpl.h b/arangod/Aql/ExecutionBlockImpl.h index 4121e760d3cf..0c87528d95b6 100644 --- a/arangod/Aql/ExecutionBlockImpl.h +++ b/arangod/Aql/ExecutionBlockImpl.h @@ -377,20 +377,20 @@ class ExecutionBlockImpl final : public ExecutionBlock { * consumed. * * An additional note on stopping the query early: In this case - * we now have at least two concurrent operations, the cleannup + * we now have at least two concurrent operations, the cleanup * which will eventually destruct all objects in the query, and * an async prefetch task. * This stop process now needs to ensure that the prefetchTask is * either completed or will not be started. - * Therefore if the task is not consumed the prefetch task is tried to + * Therefore, if the task is not consumed, the prefetch task is tried to * get claimed with above mechanism. If we can not claim it, we need * to wait for it to finish, as someone is actively working on it. * If we claimed it, we need to discard it, as it is not needed anymore. * - * Also please note: When we are cleaning up the query in the stopAsyncTasks + * Also, please note: When we are cleaning up the query in stopAsyncTasks(), * the execution order guarantees that for any ExecutionBlock we can only have * one thread that is either in the "execute" or in the "stopAsyncTasks" - * codepath The Executionblock may have the prefetch task running, but that + * codepath. The ExecutionBlock may have the prefetch task running, but that * only is within the executeFetcher, so already on the way to the next * Executor. And as we are stopping executors from "RETURN" to "Singleton" we * guarantee that we can only have prefetch tasks active for Executors in diff --git a/arangod/Aql/ExecutionBlockImpl.tpp b/arangod/Aql/ExecutionBlockImpl.tpp index 3c73d1e6f15b..14a2d3b59f7e 100644 --- a/arangod/Aql/ExecutionBlockImpl.tpp +++ b/arangod/Aql/ExecutionBlockImpl.tpp @@ -381,12 +381,16 @@ void ExecutionBlockImpl::stopAsyncTasks() { // some thread is still working on our prefetch task // -> we need to wait for that task to finish first! _prefetchTask->waitFor(); - } else { - // We took responsibility for the prefetch task, but we are not going to - // run anything, this in the shutdown process. - // We will simply discard the task - _prefetchTask->discard(/*isFinished*/ false); } + // We either claimed the task, or it is finished. Now we have to discard + // it, for two reasons: + // 1) The state must not stay InProgress, so a second call to + // `stopAsyncTasks()`, which is currently done, will not wait forever. + // 2) We must destroy the result, so a possible SharedAqlItemBlockPtr will + // return the AqlItemBlock to the AqlItemBlockManager. Note that a task + // in the scheduler queue will keep the prefetch task alive, possibly + // for longer than the query itself. + _prefetchTask->discard(/*isFinished*/ false); } } } From 948f4c84e26ae12615131e9d9b769ae7f04f8a61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20G=C3=B6dderz?= Date: Thu, 21 Aug 2025 14:31:19 +0200 Subject: [PATCH 3/4] Added CHANGELOG entry --- CHANGELOG | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 9992867cdf23..1b30dceb19f4 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,11 @@ devel ----- +* Fix BTS-2211: AQL queries using the async-prefetch rule, which result in a + failure due to a runtime exception or possibly a kill, had a possible race + during query destruction that could result in undefined behavior (e.g. + memory leaks or crashes). + * Add a separate thread to the UserManager to handle the load of the user-cache asynchronously. This pre-loads the user-cache as fast as possible (as soon as the global version changes) Also, compared to the old version, this allows the read-only operation to work on the user-cache From 0b1e2631c8cceac02454d7757f93dbc02c31ab18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20G=C3=B6dderz?= Date: Thu, 21 Aug 2025 14:37:31 +0200 Subject: [PATCH 4/4] Improved a comment --- arangod/Aql/ExecutionBlockImpl.tpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arangod/Aql/ExecutionBlockImpl.tpp b/arangod/Aql/ExecutionBlockImpl.tpp index 14a2d3b59f7e..0406232c9764 100644 --- a/arangod/Aql/ExecutionBlockImpl.tpp +++ b/arangod/Aql/ExecutionBlockImpl.tpp @@ -384,12 +384,15 @@ void ExecutionBlockImpl::stopAsyncTasks() { } // We either claimed the task, or it is finished. Now we have to discard // it, for two reasons: - // 1) The state must not stay InProgress, so a second call to - // `stopAsyncTasks()`, which is currently done, will not wait forever. - // 2) We must destroy the result, so a possible SharedAqlItemBlockPtr will - // return the AqlItemBlock to the AqlItemBlockManager. Note that a task - // in the scheduler queue will keep the prefetch task alive, possibly - // for longer than the query itself. + // 1) The state must not stay InProgress (if we claimed the task), so a + // second call to `stopAsyncTasks()`, which is currently done, will not + // wait forever. + // 2) We must destroy the result (if the task finished), so a possible + // SharedAqlItemBlockPtr will return the AqlItemBlock to the + // AqlItemBlockManager. Note that the callback executed by the + // scheduler queue will release its shared_ptr to the prefetch task + // after it has finished; so it is possible that the task outlives the + // query, and thus the AqlItemBlockManager. _prefetchTask->discard(/*isFinished*/ false); } }