Skip to content

Commit 56f78c3

Browse files
Mikael Ronström authored and Hery Ramilison committed
BUG#27625172 Step 3
------------------- Added a new test in testNodeRestart -n LCP_with_many_parts. It requires running with table T17; I used 10000 rows, which requires about 100 MByte of DataMemory. It requires that exactly 1 LDM is used to work as planned. (cherry picked from commit 4e5c98fe470d7a7e3114aa98f1f7b52960c5f24b)
1 parent f0f1123 commit 56f78c3

File tree

3 files changed

+155
-1
lines changed

3 files changed

+155
-1
lines changed

storage/ndb/src/kernel/blocks/ERROR_codes.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ Next DBDICT 6223
2323
Next DBDIH 7249
2424
Next DBTC 8117
2525
Next CMVMI 9000
26-
Next BACKUP 10048
26+
Next BACKUP 10049
2727
Next PGMAN 11010
2828
Next DBTUX 12010
2929
Next SUMA 13049
@@ -620,6 +620,7 @@ Backup Stuff:
620620
10045: Crash insertion at FSAPPENDREF
621621
10046: DIH scan tab around Get next fragment
622622
10047: Delay start of scan for LCPs
623+
10048: Ensure that LCP runs with exactly 1 part
623624

624625
11001: Send UTIL_SEQUENCE_REF (in master)
625626

storage/ndb/src/kernel/blocks/backup/Backup.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12385,6 +12385,16 @@ Backup::calculate_number_of_parts(BackupRecordPtr ptr)
1238512385
MAX(min_parts_rule3,
1238612386
MAX(min_parts_rule4, min_parts_rule5)));
1238712387

12388+
if (ERROR_INSERTED(10048) && min_parts_rule4 == 0)
12389+
{
12390+
/**
12391+
* We need this in test cases to ensure that we can create a situation
12392+
* with 1 part per LCP and having more than 980 parts and even close to
12393+
* 2048 LCPs to restore a LCP.
12394+
*/
12395+
jam();
12396+
parts = 1;
12397+
}
1238812398
#ifdef DEBUG_LCP_STAT
1238912399
TablePtr debTabPtr;
1239012400
FragmentPtr fragPtr;
@@ -14156,6 +14166,11 @@ Backup::execEND_LCPREQ(Signal* signal)
1415614166
ptr.p->slaveState.setState(DEFINING);
1415714167
ptr.p->slaveState.setState(DEFINED);
1415814168

14169+
if (ERROR_INSERTED(10048))
14170+
{
14171+
CLEAR_ERROR_INSERT_VALUE;
14172+
}
14173+
1415914174
DEB_LCP(("(%u)TAGE Send SYNC_EXTENT_PAGES_REQ", instance()));
1416014175
/**
1416114176
* As part of ending the LCP we need to ensure that the extent pages

storage/ndb/test/ndbapi/testNodeRestart.cpp

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8714,6 +8714,139 @@ int runTestStartNode(NDBT_Context* ctx, NDBT_Step* step){
87148714
return NDBT_OK;
87158715
}
87168716

8717+
/**
8718+
* In Partial LCP we need many LCPs to restore a checkpoint. The
8719+
* maximum number of LCPs we need in order to restore a checkpoint
8720+
* is 2048. This test uses error insert 10048 to ensure that each
8721+
* LCP only stores 1 part completely. This means that this test
8722+
* can generate checkpoints that have to write LCP control files
8723+
* consisting of close to 2048 parts and similarly to restore those.
8724+
*
8725+
* The test loops for more than 2048 times to ensure that we come
8726+
* to a situation with a large number of parts in each LCP and in
8727+
* particular for the last one that we are to restore. The number
8728+
* 2108 is somewhat arbitrarily chosen to ensure this.
8729+
*
8730+
* The test case is hardcoded to make those special LCPs in node 2.
8731+
*
8732+
* Between each LCP we perform a random amount of updates to ensure
8733+
* that each part of this table will create a non-empty LCP. We
8734+
* insert a number of random LCPs that are empty as well to ensure
8735+
* that we generate empty LCPs correctly as well even if there are
8736+
* many parts in the LCP.
8737+
*/
8738+
int run_PLCP_many_parts(NDBT_Context *ctx, NDBT_Step *step)
8739+
{
8740+
Ndb *pNdb = GETNDB(step);
8741+
int loops = 2108;
8742+
int result = NDBT_OK;
8743+
int records = ctx->getNumRecords();
8744+
HugoTransactions hugoTrans(*ctx->getTab());
8745+
NdbRestarter restarter;
8746+
int i = 0;
8747+
const Uint32 nodeCount = restarter.getNumDbNodes();
8748+
int nodeId = 2;
8749+
HugoOperations hugoOps(*ctx->getTab());
8750+
if (nodeCount < 2)
8751+
{
8752+
return NDBT_OK; /* Requires at least 2 nodes to run */
8753+
}
8754+
if (hugoTrans.loadTable(pNdb, records) != NDBT_OK)
8755+
{
8756+
g_err << "Failed to load table" << endl;
8757+
return NDBT_FAILED;
8758+
}
8759+
8760+
g_err << "Executing " << loops << " loops" << endl;
8761+
while(++i <= loops && result != NDBT_FAILED)
8762+
{
8763+
g_err << "Start loop " << i << endl;
8764+
ndbout << "Start an LCP" << endl;
8765+
{
8766+
if (restarter.insertErrorInNode(nodeId, 10048) != 0)
8767+
{
8768+
g_err << "ERROR: Error insert 10048 failed" << endl;
8769+
return NDBT_FAILED;
8770+
}
8771+
int val = DumpStateOrd::DihStartLcpImmediately;
8772+
if(restarter.dumpStateAllNodes(&val, 1) != 0)
8773+
{
8774+
g_err << "ERR: "<< step->getName()
8775+
<< " failed on line " << __LINE__ << endl;
8776+
return NDBT_FAILED;
8777+
}
8778+
}
8779+
bool skip = false;
8780+
if ((i % 50) == 0)
8781+
{
8782+
skip = true;
8783+
}
8784+
Uint32 batch = 4;
8785+
Uint32 row;
8786+
if (!skip)
8787+
{
8788+
row = rand() % records;
8789+
if(row + batch > (Uint32)records)
8790+
batch = records - row;
8791+
8792+
if ((hugoOps.startTransaction(pNdb) != 0) ||
8793+
(hugoOps.pkUpdateRecord(pNdb, row, batch, rand()) != 0) ||
8794+
(hugoOps.execute_Commit(pNdb)) ||
8795+
(hugoOps.closeTransaction(pNdb)))
8796+
{
8797+
g_err << "Update failed" << endl;
8798+
//return NDBT_FAILED;
8799+
}
8800+
}
8801+
NdbSleep_SecSleep(1);
8802+
if (!skip)
8803+
{
8804+
row = rand() % records;
8805+
if(row + batch > (Uint32)records)
8806+
batch = records - row;
8807+
if ((hugoOps.startTransaction(pNdb) != 0) ||
8808+
(hugoOps.pkUpdateRecord(pNdb, row, batch, rand()) != 0) ||
8809+
(hugoOps.execute_Commit(pNdb)) ||
8810+
(hugoOps.closeTransaction(pNdb)))
8811+
{
8812+
g_err << "Update failed" << endl;
8813+
//return NDBT_FAILED;
8814+
}
8815+
}
8816+
}
8817+
/**
8818+
* Finally after creating a complex restore situation we test this
8819+
* by restarting node 2 to ensure that we can also recover the
8820+
* complex LCP setup.
8821+
*/
8822+
ndbout << "Restart node 2" << endl;
8823+
if (restarter.restartOneDbNode(nodeId,
8824+
false, /* initial */
8825+
true, /* nostart */
8826+
false, /* abort */
8827+
false /* force */) != 0)
8828+
{
8829+
g_err << "Restart failed" << endl;
8830+
return NDBT_FAILED;
8831+
}
8832+
ndbout << "Wait for NoStart state" << endl;
8833+
restarter.waitNodesNoStart(&nodeId, 1);
8834+
ndbout << "Start node" << endl;
8835+
if (restarter.startNodes(&nodeId, 1) != 0)
8836+
{
8837+
g_err << "Start failed" << endl;
8838+
return NDBT_FAILED;
8839+
}
8840+
ndbout << "Waiting for node to start" << endl;
8841+
if (restarter.waitNodesStarted(&nodeId, 1) != 0)
8842+
{
8843+
g_err << "Wait node start failed" << endl;
8844+
return NDBT_FAILED;
8845+
}
8846+
ndbout << "Test complete" << endl;
8847+
return NDBT_OK;
8848+
}
8849+
87178850
int run_PLCP_I1(NDBT_Context *ctx, NDBT_Step *step)
87188851
{
87198852
Ndb *pNdb = GETNDB(step);
@@ -9591,6 +9724,11 @@ TESTCASE("MultiCrashTest",
95919724
STEP(runMultiCrashTest);
95929725
FINALIZER(runClearTable);
95939726
}
9727+
TESTCASE("LCP_with_many_parts",
9728+
"Ensure that LCP has many parts")
9729+
{
9730+
INITIALIZER(run_PLCP_many_parts);
9731+
}
95949732
TESTCASE("PLCP_I1",
95959733
"Initial node restart while deleting rows")
95969734
{

0 commit comments

Comments
 (0)