Skip to content

Commit efa43ac

Browse files
frazerclementdahlerlend
authored andcommitted
Bug#36028828 testNodeRestart -n MixedReadUpdateScan failures
Testcase failure exposed regression from fix to Bug#22602898 NDB : CURIOUS STATE OF TC COMMIT_SENT / COMPLETE_SENT TIMEOUT HANDLING Node failure handling in TC performs 1 pass through the local active transactions to find those affected by a node failure. In this pass, all transactions affected by the node failure are queued for processing, e.g. rollback, commit, complete, via e.g. the serial abort/commit or complete protocols. The exceptions are transactions in transient internal states such as CS_PREPARE_TO_COMMIT, CS_COMMITTING, CS_COMPLETING, which are then followed by stable 'wait' states such as CS_COMMIT_SENT, CS_COMPLETE_SENT. Transactions in these states were handled by doing nothing in the node failure handling pass, and relying on the timeout handling in the subsequent stable states to queue transactions for processing. The fix to Bug#22602898 removed this stable state handling to avoid it accidentally triggering, but also stopped it from triggering when needed in this case where node failure handling found a transaction in a transient state. This is solved by modifying the CS_COMMIT_SENT and CS_COMPLETE_SENT stable state handling to also perform node failure processing if a timeout has occurred for a transaction with a failure number different to the current latest failure number. This ensures that all transactions involving the failed node are handled eventually. A new testcase testNodeRestart -n TransientStatesNF T1 is added to the AT testsuite to give coverage. Change-Id: I0c0d4b6f75a97a3a7ff892cc4eafd2351491a8ff
1 parent 14c90a0 commit efa43ac

File tree

4 files changed

+117
-20
lines changed

4 files changed

+117
-20
lines changed

storage/ndb/src/kernel/blocks/ERROR_codes.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ Next DBTUP 4039
2828
Next DBLQH 5112
2929
Next DBDICT 6223
3030
Next DBDIH 7251
31-
Next DBTC 8123
31+
Next DBTC 8124
3232
Next CMVMI 9000
3333
Next TRPMAN 9501
3434
Next BACKUP 10057

storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6081,8 +6081,10 @@ void Dbtc::commit020Lab(Signal *signal,
60816081
Tcount += sendCommitLqh(signal, localTcConnectptr.p, apiConnectptr.p);
60826082

60836083
if (tcConList.next(localTcConnectptr)) {
6084-
if (Tcount < 16 && !(ERROR_INSERTED(8057) || ERROR_INSERTED(8073) ||
6085-
ERROR_INSERTED(8089))) {
6084+
if (Tcount < 16 &&
6085+
!(ERROR_INSERTED(8057) || ERROR_INSERTED(8073) ||
6086+
ERROR_INSERTED(8089) ||
6087+
(ERROR_INSERTED(8123) && ((apiConnectptr.i & 0x1) == 0)))) {
60866088
jam();
60876089
continue;
60886090
} else {
@@ -6101,7 +6103,7 @@ void Dbtc::commit020Lab(Signal *signal,
61016103
signal->theData[0] = TcContinueB::ZSEND_COMMIT_LOOP;
61026104
signal->theData[1] = apiConnectptr.i;
61036105
signal->theData[2] = localTcConnectptr.i;
6104-
if (ERROR_INSERTED(8089)) {
6106+
if (ERROR_INSERTED(8089) || ERROR_INSERTED(8123)) {
61056107
sendSignalWithDelay(cownref, GSN_CONTINUEB, signal, 100, 3);
61066108
return;
61076109
}
@@ -6513,7 +6515,8 @@ void Dbtc::complete010Lab(Signal *signal,
65136515
/* ************ */
65146516
Tcount += sendCompleteLqh(signal, localTcConnectptr.p, apiConnectptr.p);
65156517
if (tcConList.next(localTcConnectptr)) {
6516-
if (Tcount < 16 && !ERROR_INSERTED(8112)) {
6518+
if (Tcount < 16 && !ERROR_INSERTED(8112) &&
6519+
!(ERROR_INSERTED(8123) && ((apiConnectptr.i & 0x1) != 0))) {
65176520
jamDebug();
65186521
continue;
65196522
} else {
@@ -6525,7 +6528,7 @@ void Dbtc::complete010Lab(Signal *signal,
65256528
signal->theData[0] = TcContinueB::ZSEND_COMPLETE_LOOP;
65266529
signal->theData[1] = apiConnectptr.i;
65276530
signal->theData[2] = localTcConnectptr.i;
6528-
if (ERROR_INSERTED(8112)) {
6531+
if (ERROR_INSERTED(8112) || ERROR_INSERTED(8123)) {
65296532
sendSignalWithDelay(cownref, GSN_CONTINUEB, signal, 100, 3);
65306533
return;
65316534
}
@@ -8995,7 +8998,8 @@ void Dbtc::timeOutFoundLab(Signal *signal, Uint32 TapiConPtr, Uint32 errCode) {
89958998
/* or more LQH instances. We cannot speed this up. */
89968999
/* Only in confirmed node failure situations do we take action. */
89979000
/*------------------------------------------------------------------*/
8998-
if (errCode == ZNODEFAIL_BEFORE_COMMIT) {
9001+
if (errCode == ZNODEFAIL_BEFORE_COMMIT ||
9002+
apiConnectptr.p->failureNr != cfailure_nr) {
89999003
jam();
90009004
/**
90019005
* Node failure handling, switch to serial commit handling
@@ -9022,7 +9026,8 @@ void Dbtc::timeOutFoundLab(Signal *signal, Uint32 TapiConPtr, Uint32 errCode) {
90229026
/* or more LQH instances. We cannot speed this up. */
90239027
/* Only in confirmed node failure situations do we take action. */
90249028
/*--------------------------------------------------------------------*/
9025-
if (errCode == ZNODEFAIL_BEFORE_COMMIT) {
9029+
if (errCode == ZNODEFAIL_BEFORE_COMMIT ||
9030+
apiConnectptr.p->failureNr != cfailure_nr) {
90269031
jam();
90279032
/**
90289033
* Node failure handling, switch to serial complete handling

storage/ndb/test/ndbapi/testNodeRestart.cpp

Lines changed: 100 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -65,14 +65,15 @@ static int changeStartPartitionedTimeout(NDBT_Context *ctx, NDBT_Step *step) {
6565
g_err << "Setting StartPartitionedTimeout to " << startPartitionedTimeout
6666
<< endl;
6767
ConfigValues::Iterator iter(conf.m_configuration->m_config_values);
68-
for(int idx=0; iter.openSection(CFG_SECTION_NODE, idx); idx++) {
68+
for (int idx = 0; iter.openSection(CFG_SECTION_NODE, idx); idx++) {
6969
Uint32 oldValue = 0;
7070
if (iter.get(CFG_DB_START_PARTITION_TIMEOUT, oldValue)) {
7171
if (defaultValue == Uint32(~0)) {
7272
defaultValue = oldValue;
7373
} else if (oldValue != defaultValue) {
7474
g_err << "StartPartitionedTimeout is not consistent across data node"
75-
"sections" << endl;
75+
"sections"
76+
<< endl;
7677
break;
7778
}
7879
}
@@ -2832,7 +2833,8 @@ int runCreateBigTable(NDBT_Context *ctx, NDBT_Step *step) {
28322833
do {
28332834
hugoTrans.loadTableStartFrom(GETNDB(step), cnt, 10000);
28342835
cnt += 10000;
2835-
} while (cnt < rows && (NdbTick_CurrentMillisecond() - now) < 180000); // 180s
2836+
} while (cnt < rows &&
2837+
(NdbTick_CurrentMillisecond() - now) < 180000); // 180s
28362838
ndbout_c("Loaded %u rows in %llums", cnt, NdbTick_CurrentMillisecond() - now);
28372839

28382840
return NDBT_OK;
@@ -7869,10 +7871,10 @@ int run_PLCP_many_parts(NDBT_Context *ctx, NDBT_Step *step) {
78697871
}
78707872
ConfigValues::Iterator iter(conf.m_configuration->m_config_values);
78717873
Uint32 enabledPartialLCP = 1;
7872-
for(int idx=0; iter.openSection(CFG_SECTION_NODE, idx); idx++) {
7873-
Uint32 nodeId=0;
7874-
if(iter.get(CFG_NODE_ID, &nodeId)) {
7875-
if(nodeId == (Uint32) node_1) {
7874+
for (int idx = 0; iter.openSection(CFG_SECTION_NODE, idx); idx++) {
7875+
Uint32 nodeId = 0;
7876+
if (iter.get(CFG_NODE_ID, &nodeId)) {
7877+
if (nodeId == (Uint32)node_1) {
78767878
iter.get(CFG_DB_ENABLE_PARTIAL_LCP, &enabledPartialLCP);
78777879
iter.closeSection();
78787880
break;
@@ -7881,8 +7883,7 @@ int run_PLCP_many_parts(NDBT_Context *ctx, NDBT_Step *step) {
78817883
iter.closeSection();
78827884
}
78837885

7884-
if (enabledPartialLCP == 0)
7885-
{
7886+
if (enabledPartialLCP == 0) {
78867887
g_err << "[SKIPPED] Test skipped. Needs EnablePartialLcp=1" << endl;
78877888
iter.closeSection();
78887889
return NDBT_SKIPPED;
@@ -9454,7 +9455,8 @@ int runTestStallTimeoutAndNF(NDBT_Context *ctx, NDBT_Step *step) {
94549455

94559456
CHECK(hugoOps.startTransaction(pNdb, rowNum) == 0,
94569457
"Start transaction failed");
9457-
CHECK(hugoOps.pkUpdateRecord(pNdb, rowNum, 1) == 0, "Define Update failed");
9458+
CHECK(hugoOps.pkUpdateRecord(pNdb, rowNum, 1) == 0,
9459+
"Define Update failed");
94589460
CHECK(hugoOps.execute_NoCommit(pNdb) == 0, "Execute NoCommit failed");
94599461

94609462
NdbTransaction *trans = hugoOps.getTransaction();
@@ -9532,8 +9534,7 @@ int runTestStallTimeoutAndNF(NDBT_Context *ctx, NDBT_Step *step) {
95329534

95339535
if (trans->getNdbError().code != 0) {
95349536
ndbout_c("Got unexpected failure code : %u : %s",
9535-
trans->getNdbError().code,
9536-
trans->getNdbError().message);
9537+
trans->getNdbError().code, trans->getNdbError().message);
95379538
return NDBT_FAILED;
95389539
}
95399540

@@ -9555,6 +9556,86 @@ int runTestStallTimeoutAndNF(NDBT_Context *ctx, NDBT_Step *step) {
95559556
return NDBT_OK;
95569557
}
95579558

9559+
int runLargeLockingReads(NDBT_Context *ctx, NDBT_Step *step) {
9560+
int result = NDBT_OK;
9561+
int readsize = MIN(100, ctx->getNumRecords());
9562+
int i = 0;
9563+
HugoTransactions hugoTrans(*ctx->getTab());
9564+
while (ctx->isTestStopped() == false) {
9565+
g_info << i << ": ";
9566+
if (hugoTrans.pkReadRecords(GETNDB(step), readsize, readsize,
9567+
NdbOperation::LM_Read) != 0) {
9568+
return NDBT_FAILED;
9569+
}
9570+
i++;
9571+
}
9572+
return result;
9573+
}
9574+
9575+
int runRestartsWithSlowCommitComplete(NDBT_Context *ctx, NDBT_Step *step) {
9576+
int result = NDBT_OK;
9577+
NdbRestarter restarter;
9578+
const int numRestarts = 4;
9579+
9580+
if (restarter.getNumDbNodes() < 2) {
9581+
g_err << "Too few nodes" << endl;
9582+
ctx->stopTest();
9583+
return NDBT_SKIPPED;
9584+
}
9585+
9586+
for (int i = 0; i < numRestarts && !ctx->isTestStopped(); i++) {
9587+
int errorCode = 8123; // Slow commit and complete sending at TC
9588+
ndbout << "Injecting error " << errorCode << " for slow commits + completes"
9589+
<< endl;
9590+
restarter.insertErrorInAllNodes(errorCode);
9591+
9592+
/* Give some time for things to get stuck in slowness */
9593+
NdbSleep_MilliSleep(1000);
9594+
9595+
const int id = restarter.getNode(NdbRestarter::NS_RANDOM);
9596+
ndbout << "Restart node " << id << endl;
9597+
9598+
if (restarter.restartOneDbNode(id, false, true, true) != 0) {
9599+
g_err << "Failed to restart Db node" << endl;
9600+
result = NDBT_FAILED;
9601+
break;
9602+
}
9603+
9604+
if (restarter.waitNodesNoStart(&id, 1)) {
9605+
g_err << "Failed to waitNodesNoStart" << endl;
9606+
result = NDBT_FAILED;
9607+
break;
9608+
}
9609+
9610+
restarter.insertErrorInAllNodes(0);
9611+
9612+
if (restarter.startNodes(&id, 1)) {
9613+
g_err << "Failed to start node" << endl;
9614+
result = NDBT_FAILED;
9615+
break;
9616+
}
9617+
9618+
if (restarter.waitClusterStarted() != 0) {
9619+
g_err << "Cluster failed to start" << endl;
9620+
result = NDBT_FAILED;
9621+
break;
9622+
}
9623+
9624+
/* Ensure connected */
9625+
if (GETNDB(step)->get_ndb_cluster_connection().wait_until_ready(30, 30) !=
9626+
0) {
9627+
g_err << "Timeout waiting for NdbApi reconnect" << endl;
9628+
result = NDBT_FAILED;
9629+
break;
9630+
}
9631+
}
9632+
9633+
restarter.insertErrorInAllNodes(0);
9634+
ctx->stopTest();
9635+
9636+
return result;
9637+
}
9638+
95589639
NDBT_TESTSUITE(testNodeRestart);
95599640
TESTCASE("NoLoad",
95609641
"Test that one node at a time can be stopped and then restarted "
@@ -10327,6 +10408,13 @@ TESTCASE("TransStallTimeoutNF", "") {
1032710408
STEP(runTestStallTimeoutAndNF);
1032810409
FINALIZER(runClearTable);
1032910410
}
10411+
TESTCASE("TransientStatesNF",
10412+
"Test node failure handling with transactions in transient states") {
10413+
INITIALIZER(runLoadTable);
10414+
STEPS(runLargeLockingReads, 5);
10415+
STEP(runRestartsWithSlowCommitComplete);
10416+
FINALIZER(runClearTable);
10417+
}
1033010418
NDBT_TESTSUITE_END(testNodeRestart)
1033110419

1033210420
int main(int argc, const char **argv) {

storage/ndb/test/run-test/daily-devel--07-tests.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,3 +320,7 @@ cmd: test_event
320320
args: -n SubscribeEventsNRAF T1
321321
max-time: 240
322322

323+
cmd: testNodeRestart
324+
args: -n TransientStatesNF T1
325+
max-time: 360
326+

0 commit comments

Comments
 (0)