From c9b69266cd17c1c76bac5cca8aa476bbcf5273c8 Mon Sep 17 00:00:00 2001 From: "mysql-builder@oracle.com" <> Date: Tue, 4 Mar 2025 11:25:39 +0100 Subject: [PATCH 01/13] From 4c5e1f1f29ed6009f758ad82fb46dab6d1d464ce Mon Sep 17 00:00:00 2001 From: Pedro Gomes Date: Tue, 4 Mar 2025 19:13:08 +0100 Subject: [PATCH 02/13] BUG#36421684: mysql server 8.3.0 heap-buffer-overflow at Multisource_info::get_mi When using JSON fuctions as a parameter to a SOURCE_POS_WAIT/MASTER_POS_WAIT, something that should also apply to other functions that return text results, the string extracted from the parameter did not have a safe pointer to it. The cause is that the length of the string was not properly marked inside its allocated space. Usage of the method c_ptr_safe fixes this issue. Change-Id: Ic2c54999293aa2e0833594754ad681d7453e03a1 --- sql/item_func.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/item_func.cc b/sql/item_func.cc index c8febd53182b..c8a0417d3ecb 100644 --- a/sql/item_func.cc +++ b/sql/item_func.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2000, 2023, Oracle and/or its affiliates. +/* Copyright (c) 2000, 2025, Oracle and/or its affiliates. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -4912,7 +4912,7 @@ longlong Item_master_pos_wait::val_int() return 0; } - mi= channel_map.get_mi(channel_str->ptr()); + mi = channel_map.get_mi(channel_str->c_ptr_safe()); } else From e7c592ddc8f74b1c8dbe10e900015bbf725bd908 Mon Sep 17 00:00:00 2001 From: Karthik Kamath Date: Wed, 5 Mar 2025 16:06:23 +0530 Subject: [PATCH 03/13] BUG#31360522 : >=5.6.36 SOME RANGE QUERIES STILL CRASH... DESCRIPTION: ============ Certain range queries on a table with index prefixed BLOB/TEXT columns could lead to a server exit. ANALYSIS: ========= While opening the table based on its table share, in open_table_from_share(), we create a copy of the key_info from TABLE_SHARE object to TABLE object. If the key is prefixed, we allocate a new Field object, having its field_length set to the prefix key length, and point the table's matching key_part->field to this new Field object. We skip creating the new Field object for prefixed BLOB columns. A secondary key is extended by adding primary key parts to it if the primary key part does not exist in the secondary key or the key part in the secondary key is a prefix of the key field (add_pk_parts_to_sk()). The consequence of skipping the creation of new Field object for prefixed BLOB columns is that the key parts from the secondary key and primary key will be pointing to the same Field object. Later, while performing end-range scan, we check if the key is within range (compare_key_in_buffer()). We change the offsets of all the fields in the key range to make the fields point to the record buffer (move_key_field_offsets()). In case of BLOBs, we end up moving the same field twice in move_key_field_offsets(). This leads to accessing out of bound memory while performing key comparison. FIX: ==== We allow creating new Field object even for BLOB columns in open_table_from_share(). Note: ===== This issue is not a regression but rather was exposed in 5.6.36 by the patch for Bug#23481444: OPTIMISER CALL ROW_SEARCH_MVCC() AND READ THE INDEX APPLIED BY UNCOMMITTED ROWS. 
Change-Id: I407dec8a997de2c51ebf62351351288beb7dde5e --- sql/sql_partition.cc | 8 +++++++- sql/table.cc | 9 ++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/sql/sql_partition.cc b/sql/sql_partition.cc index 39de0b21bdb7..359a4f6c7b3a 100644 --- a/sql/sql_partition.cc +++ b/sql/sql_partition.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2005, 2023, Oracle and/or its affiliates. +/* Copyright (c) 2005, 2025, Oracle and/or its affiliates. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -834,6 +834,12 @@ static bool handle_list_of_fields(List_iterator it, for (i= 0; i < num_key_parts; i++) { Field *field= table->key_info[primary_key].key_part[i].field; + // BLOB/TEXT columns are not allowed in partitioning keys. + if (field->flags & BLOB_FLAG) + { + my_error(ER_BLOB_FIELD_IN_PART_FUNC_ERROR, MYF(0)); + DBUG_RETURN(TRUE); + } field->flags|= GET_FIXED_FIELDS_FLAG; } } diff --git a/sql/table.cc b/sql/table.cc index 663dc57ffbdf..fb41f51cb07e 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -3241,8 +3241,15 @@ int open_table_from_share(THD *thd, TABLE_SHARE *share, const char *alias, { Field *field= key_part->field= outparam->field[key_part->fieldnr-1]; + /* + For spatial indexes, the key parts are assigned the length (4 * + sizeof(double)) in mysql_prepare_create_table() and the + field->key_length() is set to 0. This makes it appear like a prefixed + index. However, prefixed indexes are not allowed on Geometric columns. + Hence skipping new field creation for Geometric columns. + */ if (field->key_length() != key_part->length && - !(field->flags & BLOB_FLAG)) + field->type() != MYSQL_TYPE_GEOMETRY) { /* We are using only a prefix of the column as a key: From 0d64463bb8c0cb29b515e8a8a8fd23c7be7f2467 Mon Sep 17 00:00:00 2001 From: Frazer Clement Date: Wed, 19 Feb 2025 22:31:34 +0000 Subject: [PATCH 04/13] Bug#37512477 LQH_TRANSCONF signal printer crashes There are currently two lengths of LQH_TRANSCONF used. The signal printer should accomodate both, avoiding printing uninitialised data and not making invalid assertions about the supported signal lengths. Change-Id: I0757b512445ae30c310b9c1ec37ed3ae2e1e4e38 --- .../kernel/signaldata/LqhTransConf.hpp | 3 +- .../common/debugger/signaldata/LqhTrans.cpp | 38 ++++++++++--------- .../ndb/src/kernel/blocks/dblqh/DblqhMain.cpp | 6 +-- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/storage/ndb/include/kernel/signaldata/LqhTransConf.hpp b/storage/ndb/include/kernel/signaldata/LqhTransConf.hpp index 1f9547606650..5e44f5540ab5 100644 --- a/storage/ndb/include/kernel/signaldata/LqhTransConf.hpp +++ b/storage/ndb/include/kernel/signaldata/LqhTransConf.hpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2021, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -50,6 +50,7 @@ class LqhTransConf { friend bool printLQH_TRANSCONF(FILE *, const Uint32 *, Uint32, Uint16); public: STATIC_CONST( SignalLength = 18 ); + STATIC_CONST( MarkerSignalLength = 7 ) ; /** * Upgrade diff --git a/storage/ndb/src/common/debugger/signaldata/LqhTrans.cpp b/storage/ndb/src/common/debugger/signaldata/LqhTrans.cpp index c48a84c2c3c7..ed2a5fad034c 100644 --- a/storage/ndb/src/common/debugger/signaldata/LqhTrans.cpp +++ b/storage/ndb/src/common/debugger/signaldata/LqhTrans.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2021, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. Use is subject to license terms. This program is free software; you can redistribute it and/or modify @@ -30,22 +30,26 @@ bool printLQH_TRANSCONF(FILE * output, const Uint32 * theData, Uint32 len, Uint16 receiverBlockNo) { const LqhTransConf * const sig = (LqhTransConf *)theData; - fprintf(output, " tcRef: %x\n", sig->tcRef); - fprintf(output, " lqhNodeId: %x\n", sig->lqhNodeId); - fprintf(output, " operationStatus: %x\n", sig->operationStatus); - fprintf(output, " transId1: %x\n", sig->transId1); - fprintf(output, " transId2: %x\n", sig->transId2); - fprintf(output, " apiRef: %x\n", sig->apiRef); - fprintf(output, " apiOpRec: %x\n", sig->apiOpRec); - fprintf(output, " lqhConnectPtr: %x\n", sig->lqhConnectPtr); - fprintf(output, " oldTcOpRec: %x\n", sig->oldTcOpRec); - fprintf(output, " requestInfo: %x\n", sig->requestInfo); - fprintf(output, " gci_hi: %x\n", sig->gci_hi); - fprintf(output, " gci_lo: %x\n", sig->gci_lo); - fprintf(output, " nextNodeId1: %x\n", sig->nextNodeId1); - fprintf(output, " nextNodeId2: %x\n", sig->nextNodeId2); - fprintf(output, " nextNodeId3: %x\n", sig->nextNodeId3); - fprintf(output, " tableId: %x\n", sig->tableId); + if (len >= LqhTransConf::MarkerSignalLength) { + fprintf(output, " tcRef: %x\n", sig->tcRef); + fprintf(output, " lqhNodeId: %x\n", sig->lqhNodeId); + fprintf(output, " operationStatus: %x\n", sig->operationStatus); + fprintf(output, " transId1: %x\n", sig->transId1); + fprintf(output, " transId2: %x\n", sig->transId2); + fprintf(output, " apiRef: %x\n", sig->apiRef); + fprintf(output, " apiOpRec: %x\n", sig->apiOpRec); + } + if (len >= LqhTransConf::SignalLength) { + fprintf(output, " lqhConnectPtr: %x\n", sig->lqhConnectPtr); + fprintf(output, " oldTcOpRec: %x\n", sig->oldTcOpRec); + fprintf(output, " requestInfo: %x\n", sig->requestInfo); + fprintf(output, " gci_hi: %x\n", sig->gci_hi); + fprintf(output, " gci_lo: %x\n", sig->gci_lo); + fprintf(output, " nextNodeId1: %x\n", sig->nextNodeId1); + fprintf(output, " nextNodeId2: %x\n", sig->nextNodeId2); + fprintf(output, " nextNodeId3: %x\n", sig->nextNodeId3); + fprintf(output, " tableId: %x\n", sig->tableId); + } return true; } diff --git a/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp b/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp index 074515415b21..349794747742 100644 --- a/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp +++ b/storage/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2024, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -11945,8 +11945,8 @@ Dblqh::scanMarkers(Signal* signal, lqhTransConf->transId2 = iter.curr.p->transid2; lqhTransConf->apiRef = iter.curr.p->apiRef; lqhTransConf->apiOpRec = iter.curr.p->apiOprec; - sendSignal(tcNodeFailPtr.p->newTcBlockref, GSN_LQH_TRANSCONF, - signal, 7, JBB); + sendSignal(tcNodeFailPtr.p->newTcBlockref, GSN_LQH_TRANSCONF, signal, + LqhTransConf::MarkerSignalLength, JBB); signal->theData[0] = ZSCAN_MARKERS; signal->theData[1] = tcNodeFailPtr.i; From ee50c87917d184069b19c3d9df7ca8e56d6f5cf1 Mon Sep 17 00:00:00 2001 From: Frazer Clement Date: Wed, 19 Feb 2025 22:48:42 +0000 Subject: [PATCH 05/13] Bug#37512526 Signal dump code can read out of bounds Avoid reading signal section pointers that are not present. Change-Id: Ifdc5ae688ae2f3d3c64c895ca7a062b1ab0ccc6a --- storage/ndb/src/kernel/vm/mt.cpp | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/storage/ndb/src/kernel/vm/mt.cpp b/storage/ndb/src/kernel/vm/mt.cpp index 79a365227aad..1bd57191b635 100644 --- a/storage/ndb/src/kernel/vm/mt.cpp +++ b/storage/ndb/src/kernel/vm/mt.cpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2008, 2022, Oracle and/or its affiliates. +/* Copyright (c) 2008, 2025, Oracle and/or its affiliates. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -8304,9 +8304,25 @@ FastScheduler::dumpSignalMemory(Uint32 thr_no, FILE* out) signal.header.theReceiversBlockNumber &= NDBMT_BLOCK_MASK; const Uint32 *posptr = reinterpret_cast(s); - signal.m_sectionPtrI[0] = posptr[siglen + 0]; - signal.m_sectionPtrI[1] = posptr[siglen + 1]; - signal.m_sectionPtrI[2] = posptr[siglen + 2]; + signal.m_sectionPtrI[0] = RNIL; + signal.m_sectionPtrI[1] = RNIL; + signal.m_sectionPtrI[2] = RNIL; + switch (s->m_noOfSections) { + case 3: + signal.m_sectionPtrI[2] = posptr[siglen + 2]; + [[fallthrough]]; + case 2: + signal.m_sectionPtrI[1] = posptr[siglen + 1]; + [[fallthrough]]; + case 1: + signal.m_sectionPtrI[0] = posptr[siglen + 0]; + [[fallthrough]]; + case 0: + break; + default: + /* Out of range - ignore */ + break; + }; bool prioa = signalSequence[seq_end].prioa; /* Make sure to display clearly when there is a gap in the dump. */ From 5b28126108e4a684bdd3394a14818d3c54d0ac58 Mon Sep 17 00:00:00 2001 From: Frazer Clement Date: Wed, 5 Mar 2025 22:36:05 +0000 Subject: [PATCH 06/13] Bug#37512526 Signal dump code can read out of bounds Followup fix - remove [[fallthrough]] not supported on all compilers used for 7.6. 
Change-Id: Ifdc5ae688ae2f3d3c64c895ca7a062b1ab0ccc6a --- storage/ndb/src/kernel/vm/mt.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/storage/ndb/src/kernel/vm/mt.cpp b/storage/ndb/src/kernel/vm/mt.cpp index 1bd57191b635..f3eef3a619f8 100644 --- a/storage/ndb/src/kernel/vm/mt.cpp +++ b/storage/ndb/src/kernel/vm/mt.cpp @@ -8310,13 +8310,13 @@ FastScheduler::dumpSignalMemory(Uint32 thr_no, FILE* out) switch (s->m_noOfSections) { case 3: signal.m_sectionPtrI[2] = posptr[siglen + 2]; - [[fallthrough]]; + /* Fall through */ case 2: signal.m_sectionPtrI[1] = posptr[siglen + 1]; - [[fallthrough]]; + /* Fall through */ case 1: signal.m_sectionPtrI[0] = posptr[siglen + 0]; - [[fallthrough]]; + /* Fall through */ case 0: break; default: From 10c4ba9677c42cd10bfd49833050dc3120840a6d Mon Sep 17 00:00:00 2001 From: Frazer Clement Date: Wed, 5 Mar 2025 16:51:00 +0000 Subject: [PATCH 07/13] Backport of NDBT_Test functionality to identify parallel steps Originally committed in : commit 4329a1385304788d642d0dec34174a6cd7842eb7 Author: Frazer Clement Date: Fri Oct 8 23:54:33 2021 +0100 Bug #32478380 DEADLOCK TIMEOUT DUE TO PROBLEM IN REDO LOG QUEUE HANDLING Originally Approved by : Maitrayi Sabaratnam Change-Id: Idcd159d0da98b6ed4c8542895fcfbaa962a75240 --- storage/ndb/test/include/NDBT_Test.hpp | 26 +++++++++++++++--- storage/ndb/test/src/NDBT_Test.cpp | 38 +++++++++++++++++++++++--- 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/storage/ndb/test/include/NDBT_Test.hpp b/storage/ndb/test/include/NDBT_Test.hpp index 3ccfad0031f4..1216aff05ad7 100644 --- a/storage/ndb/test/include/NDBT_Test.hpp +++ b/storage/ndb/test/include/NDBT_Test.hpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2024, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -121,6 +121,18 @@ class NDBT_Context { * Get config by beeing friend to ndb_cluster_connection_impl - ugly */ NdbApiConfig const& getConfig() const; + + /** + * get a subrange of records - useful for splitting work amongst + * threads and avoiding contention. 
+ */ + static + void getRecordSubRange(int records, + int rangeCount, + int rangeId, + int& startRecord, + int& stopRecord); + private: friend class NDBT_Step; friend class NDBT_TestSuite; @@ -161,13 +173,17 @@ class NDBT_Step { const char* getName() { return name; } int getStepNo() { return step_no; } void setStepNo(int n) { step_no = n; } + /* Parallel steps : Step x/y (x counting from 0) */ + int getStepTypeNo() { return step_type_no; } + int getStepTypeCount() { return step_type_count; } protected: NDBT_Context* m_ctx; const char* name; NDBT_TESTFUNC* func; NDBT_TestCase* testcase; int step_no; - + int step_type_no; + int step_type_count; private: int setUp(Ndb_cluster_connection&); void tearDown(); @@ -182,7 +198,9 @@ class NDBT_ParallelStep : public NDBT_Step { public: NDBT_ParallelStep(NDBT_TestCase* ptest, const char* pname, - NDBT_TESTFUNC* pfunc); + NDBT_TESTFUNC* pfunc, + int num = 0, + int count = 1); virtual ~NDBT_ParallelStep() {} }; @@ -503,7 +521,7 @@ C##suitname():NDBT_TestSuite(#suitname){ \ // Add a number of equal steps to the testcase #define STEPS(stepfunc, num) \ { int i; for (i = 0; i < num; i++){ \ - pts = new NDBT_ParallelStep(pt, #stepfunc, stepfunc); \ + pts = new NDBT_ParallelStep(pt, #stepfunc, stepfunc, i, num); \ pt->addStep(pts);\ } } diff --git a/storage/ndb/test/src/NDBT_Test.cpp b/storage/ndb/test/src/NDBT_Test.cpp index 9bde1f21725f..9b948b4b7abd 100644 --- a/storage/ndb/test/src/NDBT_Test.cpp +++ b/storage/ndb/test/src/NDBT_Test.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2024, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -280,10 +280,35 @@ void NDBT_Context::setNumLoops(int _loops){ loops = _loops; } +void NDBT_Context::getRecordSubRange(int records, + int rangeCount, + int rangeId, + int& startRecord, + int& stopRecord) +{ + int recordsPerStep = records / rangeCount; + if (recordsPerStep == 0) + { + recordsPerStep = 1; + } + startRecord = rangeId * recordsPerStep; + stopRecord = startRecord + recordsPerStep; + + if (stopRecord > records) + { + stopRecord = records; + } + if (startRecord >= records) + { + startRecord = stopRecord = 0; + } +} + NDBT_Step::NDBT_Step(NDBT_TestCase* ptest, const char* pname, NDBT_TESTFUNC* pfunc) : m_ctx(NULL), name(pname), func(pfunc), - testcase(ptest), step_no(-1), m_ndb(NULL) + testcase(ptest), step_no(-1), step_type_no(0), + step_type_count(1), m_ndb(NULL) { } @@ -389,9 +414,14 @@ NDBT_Context* NDBT_Step::getContext(){ NDBT_ParallelStep::NDBT_ParallelStep(NDBT_TestCase* ptest, - const char* pname, - NDBT_TESTFUNC* pfunc) + const char* pname, + NDBT_TESTFUNC* pfunc, + int num, + int count) : NDBT_Step(ptest, pname, pfunc) { + require(num < count); + step_type_no = num; + step_type_count = count; } NDBT_Verifier::NDBT_Verifier(NDBT_TestCase* ptest, const char* pname, From bfdb2deb9f2dd399e3eaec2853ed3eba9663ec17 Mon Sep 17 00:00:00 2001 From: Frazer Clement Date: Wed, 5 Mar 2025 16:09:48 +0000 Subject: [PATCH 08/13] Bug#37524092 Improve Api Failure handling logs + limit duration 7.6 backport Improve observability of API failure handling stall - QMGR signals blocks yet to complete API failure handling to dump block internal API failure handling state - TC enhanced to - Track + dump API failure handling sub-state - Dump info about remaining transactions to be handled - Include TC instance number in generated logs - Also dump to node log in 
cases where truncation may occur in cluster log Change-Id: I20c96ba9081610abd4c4f9696bada496b8f4c1ba --- storage/ndb/src/kernel/blocks/ERROR_codes.txt | 4 +- storage/ndb/src/kernel/blocks/dbtc/Dbtc.hpp | 14 ++- .../ndb/src/kernel/blocks/dbtc/DbtcMain.cpp | 95 ++++++++++++++----- .../ndb/src/kernel/blocks/qmgr/QmgrMain.cpp | 14 ++- 4 files changed, 96 insertions(+), 31 deletions(-) diff --git a/storage/ndb/src/kernel/blocks/ERROR_codes.txt b/storage/ndb/src/kernel/blocks/ERROR_codes.txt index 096fc26dc0af..0265e7eb6ea2 100644 --- a/storage/ndb/src/kernel/blocks/ERROR_codes.txt +++ b/storage/ndb/src/kernel/blocks/ERROR_codes.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2003, 2024, Oracle and/or its affiliates. +# Copyright (c) 2003, 2025, Oracle and/or its affiliates. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License, version 2.0, @@ -29,7 +29,7 @@ Next DBTUP 4040 Next DBLQH 5113 Next DBDICT 6227 Next DBDIH 7251 -Next DBTC 8125 +Next DBTC 8127 Next TRPMAN 9007 Next CMVMI 9993 Note: CMVMI grows downwards Next BACKUP 10057 diff --git a/storage/ndb/src/kernel/blocks/dbtc/Dbtc.hpp b/storage/ndb/src/kernel/blocks/dbtc/Dbtc.hpp index c1ff3d8fe6fb..cc08823a6e93 100644 --- a/storage/ndb/src/kernel/blocks/dbtc/Dbtc.hpp +++ b/storage/ndb/src/kernel/blocks/dbtc/Dbtc.hpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2024, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -1210,6 +1210,16 @@ class Dbtc Uint32 m_location_domain_id; + /* Discrete states of API failure handling for logs etc */ + enum ApiFailStates { + AF_IDLE, + AF_CHECK_TRANS, + AF_CHECK_MARKERS, + AF_CHECK_MARKERS_WAIT_TC_TAKEOVER, + AF_CHECK_MARKERS_WAIT_TRANS + }; + Uint32 m_af_state; + /* Independent steps of Data node failure handling */ enum NodeFailBits { NF_TAKEOVER = 0x1, @@ -1218,7 +1228,7 @@ class Dbtc NF_BLOCK_HANDLE = 0x8, NF_NODE_FAIL_BITS = 0xF // All bits... }; - Uint32 m_nf_bits; + Uint32 m_nf_bits; /* Node fail handling state */ NdbNodeBitmask m_lqh_trans_conf; /** * Indicator if any history to track yet diff --git a/storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp b/storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp index 113dbf9b98ff..905c45bbf109 100644 --- a/storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp +++ b/storage/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2003, 2024, Oracle and/or its affiliates. +/* Copyright (c) 2003, 2025, Oracle and/or its affiliates. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -1219,6 +1219,7 @@ void Dbtc::execAPI_FAILREQ(Signal* signal) **************************************************************************/ jamEntry(); + const Uint32 apiNodeId = signal->theData[0]; if (ERROR_INSERTED(8056)) { CLEAR_ERROR_INSERT_VALUE; @@ -1227,15 +1228,16 @@ void Dbtc::execAPI_FAILREQ(Signal* signal) #ifdef ERROR_INSERT if (ERROR_INSERTED(8078)) { - c_lastFailedApi = signal->theData[0]; + c_lastFailedApi = apiNodeId; SET_ERROR_INSERT_VALUE(8079); } #endif capiFailRef = signal->theData[1]; - arrGuard(signal->theData[0], MAX_NODES); - capiConnectClosing[signal->theData[0]] = 1; - handleFailedApiNode(signal, signal->theData[0], (UintR)0); + + arrGuard(apiNodeId, MAX_NODES); + capiConnectClosing[apiNodeId] = 1; + handleFailedApiNode(signal, apiNodeId, (UintR)0); } /** @@ -1425,8 +1427,13 @@ Dbtc::handleFailedApiNode(Signal* signal, { UintR TloopCount = 0; arrGuard(TapiFailedNode, MAX_NODES); + hostptr.i = TapiFailedNode; + ptrCheckGuard(hostptr, chostFilesize, hostRecord); + /* Mark progress */ + hostptr.p->m_af_state = HostRecord::AF_CHECK_TRANS; apiConnectptr.i = TapiConnectPtr; - do { + while (TloopCount++ <= 256 && !ERROR_INSERTED(8125)) + { ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord); const UintR TapiNode = refToNode(apiConnectptr.p->ndbapiBlockref); if (TapiNode == TapiFailedNode) @@ -1456,7 +1463,7 @@ Dbtc::handleFailedApiNode(Signal* signal, removeMarkerForFailedAPI(signal, TapiFailedNode, 0); return; }//if - } while (TloopCount++ < 256); + } signal->theData[0] = TcContinueB::ZHANDLE_FAILED_API_NODE; signal->theData[1] = TapiFailedNode; signal->theData[2] = apiConnectptr.i; @@ -1471,8 +1478,16 @@ Dbtc::removeMarkerForFailedAPI(Signal* signal, TcFailRecordPtr node_fail_ptr; node_fail_ptr.i = 0; ptrAss(node_fail_ptr, tcFailRecord); - if(node_fail_ptr.p->failStatus != FS_IDLE) { + HostRecordPtr myHostPtr; + myHostPtr.i = nodeId; + ptrCheckGuard(myHostPtr, chostFilesize, hostRecord); + /* Mark progress */ + myHostPtr.p->m_af_state = HostRecord::AF_CHECK_MARKERS; + + if(node_fail_ptr.p->failStatus != FS_IDLE || ERROR_INSERTED(8126)) { jam(); + /* Mark progress */ + myHostPtr.p->m_af_state = HostRecord::AF_CHECK_MARKERS_WAIT_TC_TAKEOVER; DEBUG("Restarting removeMarkerForFailedAPI"); /** * TC take-over in progress @@ -1501,6 +1516,8 @@ Dbtc::removeMarkerForFailedAPI(Signal* signal, capiConnectClosing[nodeId]--; if (capiConnectClosing[nodeId] == 0) { jam(); + /* Mark progress */ + myHostPtr.p->m_af_state = HostRecord::AF_IDLE; /********************************************************************/ // No outstanding ABORT or COMMIT's of this failed API node. @@ -1538,6 +1555,9 @@ Dbtc::removeMarkerForFailedAPI(Signal* signal, * * Don't remove it, but continueb retry with a short delay */ + /* Mark progress */ + myHostPtr.p->m_af_state = HostRecord::AF_CHECK_MARKERS_WAIT_TRANS; + signal->theData[0] = TcContinueB::ZHANDLE_FAILED_API_NODE_REMOVE_MARKERS; signal->theData[1] = nodeId; signal->theData[2] = iter.bucket; @@ -1588,6 +1608,11 @@ void Dbtc::handleApiFailState(Signal* signal, UintR TapiConnectptr) { jam(); + /* Mark progress */ + hostptr.i = TfailedApiNode; + ptrCheckGuard(hostptr, chostFilesize, hostRecord); + hostptr.p->m_af_state = HostRecord::AF_IDLE; + /** * Perform block-level cleanups (e.g assembleFragments...) 
*/ @@ -15343,6 +15368,7 @@ void Dbtc::inithost(Signal* signal) container->noOfPackedWords = 0; container->hostBlockRef = numberToRef(DBLQH, i, hostptr.i); } + hostptr.p->m_af_state = HostRecord::AF_IDLE; hostptr.p->m_nf_bits = 0; }//for c_alive_nodes.clear(); @@ -16617,7 +16643,7 @@ Dbtc::execDUMP_STATE_ORD(Signal* signal) if (len + 2 > 25) { jam(); - infoEvent("Too long filter"); + infoEvent("DBTC %u: Too long filter", instance()); return; } if (validate_filter(signal)) @@ -16628,7 +16654,7 @@ Dbtc::execDUMP_STATE_ORD(Signal* signal) signal->theData[1] = 0; // record sendSignal(reference(), GSN_DUMP_STATE_ORD, signal, len + 2, JBB); - infoEvent("Starting dump of transactions"); + infoEvent("DBTC %u: Starting dump of transactions", instance()); } return; } @@ -16663,7 +16689,7 @@ Dbtc::execDUMP_STATE_ORD(Signal* signal) if (ap.i == capiConnectFilesize) { jam(); - infoEvent("End of transaction dump"); + infoEvent("DBTC %u: End of transaction dump", instance()); return; } @@ -16695,12 +16721,30 @@ Dbtc::execDUMP_STATE_ORD(Signal* signal) NodeId nodeId = signal->theData[1]; if (nodeId < MAX_NODES && nodeId < NDB_ARRAY_SIZE(capiConnectClosing)) { - warningEvent(" DBTC: capiConnectClosing[%u]: %u", - nodeId, capiConnectClosing[nodeId]); + if (getNodeInfo(nodeId).getType() == NODE_TYPE_API) { + jam(); + hostptr.i = nodeId; + ptrCheckGuard(hostptr, chostFilesize, hostRecord); + warningEvent(" DBTC %u: capiConnectClosing[%u]: %u", instance(), nodeId, + capiConnectClosing[nodeId]); + warningEvent(" DBTC %u: apiFailState[%u]: %u", instance(), nodeId, + hostptr.p->m_af_state); + + if (capiConnectClosing[nodeId] > 0) { + jam(); + /* Dump all transactions with given nodeid as client */ + signal->theData[0] = 2550; + signal->theData[1] = 1; + signal->theData[2] = nodeId; + sendSignal(reference(), GSN_DUMP_STATE_ORD, signal, 3, JBB); + } + } + // Could add more info for Data node failure handling delay } else { - warningEvent(" DBTC: dump-%u to unknown node: %u", arg, nodeId); + warningEvent(" DBTC %u: dump-%u to unknown node: %u", instance(), arg, + nodeId); } } @@ -17268,19 +17312,18 @@ Dbtc::match_and_print(Signal* signal, ApiConnectRecordPtr apiPtr) break; } - char buf[100]; - BaseString::snprintf(buf, sizeof(buf), - "TRX[%u]: API: %d(0x%x)" - "transid: 0x%x 0x%x inactive: %u(%d) state: %s", - apiPtr.i, - refToNode(apiPtr.p->ndbapiBlockref), - refToBlock(apiPtr.p->ndbapiBlockref), - apiPtr.p->transid[0], - apiPtr.p->transid[1], - apiTimer ? (ctcTimer - apiTimer) / 100 : 0, - c_apiConTimer_line[apiPtr.i], - stateptr); + char buf[150]; + BaseString::snprintf( + buf, sizeof(buf), + "DBTC %u TRX[%u] API %d(0x%x)" + "trid 0x%x 0x%x inact %u(%d) state %s nodes %s", + instance(), apiPtr.i, refToNode(apiPtr.p->ndbapiBlockref), + refToBlock(apiPtr.p->ndbapiBlockref), apiPtr.p->transid[0], + apiPtr.p->transid[1], apiTimer ? (ctcTimer - apiTimer) / 100 : 0, + c_apiConTimer_line[apiPtr.i], stateptr, + BaseString::getPrettyText(apiPtr.p->m_transaction_nodes).c_str()); infoEvent("%s", buf); + g_eventLogger->info("%s", buf); memcpy(signal->theData, temp, 4*len); return true; diff --git a/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp b/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp index b6aa84ad48d1..431d1584bf4e 100644 --- a/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp +++ b/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2023, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -3188,6 +3188,18 @@ void Qmgr::checkStartInterface(Signal* signal, NDB_TICKS now) nodePtr.p->m_failconf_blocks[3], nodePtr.p->m_failconf_blocks[4]); warningEvent("%s", buf); + + /* Ask delayed block(s) to explain themselves */ + for (Uint32 i = 0; + i < NDB_ARRAY_SIZE(nodePtr.p->m_failconf_blocks); i++) { + if (nodePtr.p->m_failconf_blocks[i] != 0) { + signal->theData[0] = DumpStateOrd::DihTcSumaNodeFailCompleted; + signal->theData[1] = nodePtr.i; + const Uint32 dstRef = + numberToRef(nodePtr.p->m_failconf_blocks[i], 0); + sendSignal(dstRef, GSN_DUMP_STATE_ORD, signal, 2, JBB); + } + } } } } From 27d890619094b346c1bb7e941efd3004f0b8ffab Mon Sep 17 00:00:00 2001 From: Shubham Sinha Date: Mon, 24 Feb 2025 05:27:12 +0100 Subject: [PATCH 09/13] Bug# 37607195 - fprintf_string not using the actual quote parameter (mysql-5.7) The fprintf_string function present in mysqldump takes the quote of the string as a parameter, but does not pass it to the mysql_real_escape_string_quote to escape the string. The fix is to pass the string quote to mysql_real_escape_string_quote as a parameter. Added a test case. Change-Id: Idc2001a96679fe32bb48e5e3a14d724d5ab9cb9f --- client/mysqldump.c | 4 +- .../r/mysqldump-tablespace-escape.result | 71 ++++++++++++ mysql-test/t/mysqldump-tablespace-escape.test | 108 +++++++++++++++++- 3 files changed, 180 insertions(+), 3 deletions(-) diff --git a/client/mysqldump.c b/client/mysqldump.c index 71db81168349..6ab6bb99657f 100644 --- a/client/mysqldump.c +++ b/client/mysqldump.c @@ -2356,7 +2356,7 @@ static void fprintf_string(char *row, ulong row_len, char quote, pbuffer = (char *)my_malloc(PSI_NOT_INSTRUMENTED, curr_row_size, MYF(0)); // Put the sanitized row in the buffer. - mysql_real_escape_string_quote(mysql, pbuffer, row, row_len, '\''); + mysql_real_escape_string_quote(mysql, pbuffer, row, row_len, quote); // Opening quote fputc(quote, md_result_file); @@ -4658,7 +4658,7 @@ static int dump_tablespaces(char* ts_where) mysql_free_result(tableres); mysql_query_with_error_report( mysql, &tableres, - "SELECT 'TN; /*' AS TABLESPACE_NAME, 'FN' AS FILE_NAME, 'LGN' AS " + "SELECT 'T`N; /*' AS TABLESPACE_NAME, 'FN' AS FILE_NAME, 'LGN' AS " "LOGFILE_GROUP_NAME, 77 AS EXTENT_SIZE, 88 AS INITIAL_SIZE, " "'*/\nsystem touch foo;\n' AS ENGINE"); }); diff --git a/mysql-test/r/mysqldump-tablespace-escape.result b/mysql-test/r/mysqldump-tablespace-escape.result index 5c4b3c117210..d923710a114e 100644 --- a/mysql-test/r/mysqldump-tablespace-escape.result +++ b/mysql-test/r/mysqldump-tablespace-escape.result @@ -2,7 +2,78 @@ # Bug#36816986 - MySQL Shell command injection # CREATE DATABASE bug36816986; +USE bug36816986; -- Run mysqldump with tablespace_injection_test. The test injected string must be found: Pattern found. +The ` must be escaped: +Pattern found. DROP DATABASE bug36816986; + +####################################### + +# +# Bug#37607195 - fprintf_string not using the actual quote parameter +# +CREATE DATABASE bug37607195; +USE bug37607195; +Create a bunch of tables with numerous ` ' " \n etc. 
+SET @@sql_mode='ANSI_QUOTES,ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION'; +CREATE TABLE "custo`mers" ( +"customer'_id" INT AUTO_INCREMENT PRIMARY KEY, +"fir`st_`na`me" VARCHAR(50) NOT NULL, +"last_'name" VARCHAR(50) NOT NULL, +"em`ail" VARCHAR(100) UNIQUE NOT NULL, +`pho"\ne` VARCHAR(15), +"created'_'at" TIMESTAMP DEFAULT CURRENT_TIMESTAMP, +"updated'_'at" TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP +); +CREATE TABLE "prod'ucts" ( +"product`_`id" INT AUTO_INCREMENT PRIMARY KEY, +"product'_`name" VARCHAR(100) NOT NULL, +"descri`p`t`i`o`n" TEXT, +"pr'i'ce" DECIMAL(10, 2) NOT NULL CHECK ("pr'i'ce" >= 0), +`stock"_"qua\ntity` INT DEFAULT 0, +`created'_'at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP, +`updated"_'at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, +INDEX ("product'_`name") +); +CREATE TABLE "orders" ( +"order_id" INT AUTO_INCREMENT PRIMARY KEY, +"customer_id" INT NOT NULL, +"order_date" TIMESTAMP DEFAULT CURRENT_TIMESTAMP, +"status" ENUM('Pending', 'Completed', 'Cancelled') NOT NULL, +"total\n" DECIMAL(10, 2) NOT NULL CHECK ("total\n" >= 0), +FOREIGN KEY (customer_id) REFERENCES "custo`mers"("customer'_id") ON DELETE CASCADE, +INDEX (order_date) +); +CREATE TABLE `'order'_'items'` ( +`order'_'item_id` INT AUTO_INCREMENT PRIMARY KEY, +`'order'_'id'` INT NOT NULL, +`product'_'id` INT NOT NULL, +`qua\ntity` INT NOT NULL CHECK (`qua\ntity` > 0), +`p'rice` DECIMAL(10,2) NOT NULL CHECK (`p'rice` >= 0), +FOREIGN KEY (`'order'_'id'`) REFERENCES "orders"(order_id) ON DELETE CASCADE, +FOREIGN KEY (`product'_'id`) REFERENCES "prod'ucts"("product`_`id") ON DELETE CASCADE, +UNIQUE KEY (`'order'_'id'`, `product'_'id`) +); +# Table 1: `'order'_'items'` +# `qua\ntity` must be escaped +Pattern found. +# Table 2: "custo`mers" +# "custo`mers" must be escaped +Pattern found. +# `pho"\ne` must be escaped +Pattern found. +# Table 3: "orders" +# `total\n` must be escaped +Pattern found. +# FOREIGN KEY (`customer_id`) REFERENCES must be escaped +Pattern found. +# Table 4: `prod'ucts` +# "descri`p`t`i`o`n" TEXT must be escaped +Pattern found. +# `stock"_"qua\ntity` must be escaped +Pattern found. +SET @@sql_mode='ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION'; +DROP DATABASE bug37607195; diff --git a/mysql-test/t/mysqldump-tablespace-escape.test b/mysql-test/t/mysqldump-tablespace-escape.test index 0a23c154facc..dc3153d76380 100644 --- a/mysql-test/t/mysqldump-tablespace-escape.test +++ b/mysql-test/t/mysqldump-tablespace-escape.test @@ -8,6 +8,7 @@ let $grep_file= $MYSQLTEST_VARDIR/tmp/bug36816986.sql; let $grep_output=boolean; CREATE DATABASE bug36816986; +USE bug36816986; --echo -- Run mysqldump with tablespace_injection_test. 
--exec $MYSQL_DUMP --debug="d,tablespace_injection_test" --result-file=$grep_file bug36816986 --all-tablespaces 2>&1 @@ -16,6 +17,111 @@ CREATE DATABASE bug36816986; let $grep_pattern=qr| ENGINE=\*/\nsystem touch foo|; --source include/grep_pattern.inc -# Cleanup +--echo The ` must be escaped: +let $grep_pattern=qr|CREATE TABLESPACE `T``N; /*`|; +--source include/grep_pattern.inc + --remove_file $grep_file DROP DATABASE bug36816986; + +--echo +--echo ####################################### +--echo + +--echo # +--echo # Bug#37607195 - fprintf_string not using the actual quote parameter +--echo # + +CREATE DATABASE bug37607195; +USE bug37607195; + +let $grep_file= $MYSQLTEST_VARDIR/tmp/bug37607195.sql; +let $grep_output=boolean; + +--echo Create a bunch of tables with numerous ` ' " \n etc. + +--disable_warnings +SET @@sql_mode='ANSI_QUOTES,ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION'; +--enable_warnings + +CREATE TABLE "custo`mers" ( + "customer'_id" INT AUTO_INCREMENT PRIMARY KEY, + "fir`st_`na`me" VARCHAR(50) NOT NULL, + "last_'name" VARCHAR(50) NOT NULL, + "em`ail" VARCHAR(100) UNIQUE NOT NULL, + `pho"\ne` VARCHAR(15), + "created'_'at" TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + "updated'_'at" TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP +); + +CREATE TABLE "prod'ucts" ( + "product`_`id" INT AUTO_INCREMENT PRIMARY KEY, + "product'_`name" VARCHAR(100) NOT NULL, + "descri`p`t`i`o`n" TEXT, + "pr'i'ce" DECIMAL(10, 2) NOT NULL CHECK ("pr'i'ce" >= 0), + `stock"_"qua\ntity` INT DEFAULT 0, + `created'_'at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + `updated"_'at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + INDEX ("product'_`name") +); + +CREATE TABLE "orders" ( + "order_id" INT AUTO_INCREMENT PRIMARY KEY, + "customer_id" INT NOT NULL, + "order_date" TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + "status" ENUM('Pending', 'Completed', 'Cancelled') NOT NULL, + "total\n" DECIMAL(10, 2) NOT NULL CHECK ("total\n" >= 0), + FOREIGN KEY (customer_id) REFERENCES "custo`mers"("customer'_id") ON DELETE CASCADE, + INDEX (order_date) +); + +CREATE TABLE `'order'_'items'` ( + `order'_'item_id` INT AUTO_INCREMENT PRIMARY KEY, + `'order'_'id'` INT NOT NULL, + `product'_'id` INT NOT NULL, + `qua\ntity` INT NOT NULL CHECK (`qua\ntity` > 0), + `p'rice` DECIMAL(10,2) NOT NULL CHECK (`p'rice` >= 0), + FOREIGN KEY (`'order'_'id'`) REFERENCES "orders"(order_id) ON DELETE CASCADE, + FOREIGN KEY (`product'_'id`) REFERENCES "prod'ucts"("product`_`id") ON DELETE CASCADE, + UNIQUE KEY (`'order'_'id'`, `product'_'id`) +); + +--exec $MYSQL_DUMP bug37607195 --result-file=$grep_file 2>&1 + +--echo # Table 1: `'order'_'items'` +--echo # `qua\ntity` must be escaped +let $grep_pattern=qr| `qua\ntity` INT NOT NULL CHECK (`qua\ntity` > 0)|; +--source include/grep_pattern.inc + +--echo # Table 2: "custo`mers" +--echo # "custo`mers" must be escaped +let $grep_pattern=qr|CREATE TABLE `custo``mers`|; +--source include/grep_pattern.inc + +--echo # `pho"\ne` must be escaped +let $grep_pattern=qr|`pho"\ne` varchar(15) DEFAULT NULL|; +--source include/grep_pattern.inc + +--echo # Table 3: "orders" +--echo # `total\n` must be escaped +let $grep_pattern=qr|`total\n` decimal(10,2) NOT NULL|; +--source include/grep_pattern.inc + +--echo # FOREIGN KEY (`customer_id`) REFERENCES must be escaped +let $grep_pattern=qr|REFERENCES `custo``mers`|; +--source include/grep_pattern.inc + +--echo # Table 4: `prod'ucts` +--echo # "descri`p`t`i`o`n" TEXT must 
be escaped +let $grep_pattern=qr|`descri``p``t``i``o``n` text|; +--source include/grep_pattern.inc + +--echo # `stock"_"qua\ntity` must be escaped +let $grep_pattern=qr|`stock"_"qua\ntity` int DEFAULT '0'|; +--source include/grep_pattern.inc + +SET @@sql_mode='ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION'; + +# Cleanup +--remove_file $grep_file +DROP DATABASE bug37607195; From 6b537204887a549a1c53da4c2b0e81ea53ebd538 Mon Sep 17 00:00:00 2001 From: Frazer Clement Date: Wed, 5 Mar 2025 17:23:14 +0000 Subject: [PATCH 10/13] Bug#37524092 Improve Api Failure handling logs + limit duration Backport to 7.6 Implement a time limit on API failure handling at the data nodes. QMGR already logs when API failure handling is taking a long time. This is extended with a time limit on how long a data node is allowed to take to handle an API node failure before it is considered a failure of the data node itself. API failure handling involves cleanup of transaction state in the DICT, TC, SPJ and SUMA blocks. DICT transaction state can include schema transactions which may take some time to complete, so a hard-coded long timeout (7 days) is used for cases where API failure handling is blocked on DICT processing. For other blocks, a shorter configurable timeout is used. When the timeout elapses, the data node with the problem will be shutdown, which will hopefully help resolve the problem. The short timeout is configured with a new data node configuration parameter : ApiFailureHandlingTimeout Units : Seconds Where : 0 = No limit 1..10 = 10 seconds > 10 = Limit in seconds Default: 600 Two new tests are added to extend coverage of API failure handling : testNodeRestart -n multi_apifail Coverage of concurrent API failure handling testNodeRestart -n timeout_apifail Coverage of data node timeout of api failure handling Change-Id: Iefea39042cdd4ea83fb22fe2681f9e7bf7d56dba --- mysql-test/suite/ndb/r/ndbinfo_plans.result | 6 +- .../include/mgmapi/mgmapi_config_parameters.h | 4 +- storage/ndb/include/mgmapi/ndbd_exit_codes.h | 4 +- storage/ndb/src/kernel/blocks/ERROR_codes.txt | 4 +- .../ndb/src/kernel/blocks/dbdict/Dbdict.cpp | 11 +- storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp | 3 +- .../ndb/src/kernel/blocks/qmgr/QmgrMain.cpp | 87 +++- .../ndb/src/kernel/error/ndbd_exit_codes.c | 7 +- storage/ndb/src/kernel/vm/NdbinfoTables.cpp | 4 +- storage/ndb/src/mgmsrv/ConfigInfo.cpp | 9 +- storage/ndb/test/ndbapi/testNodeRestart.cpp | 375 +++++++++++++++++- storage/ndb/test/run-test/conf-autotest.cnf | 22 +- .../test/run-test/daily-devel--07-tests.txt | 11 +- 13 files changed, 508 insertions(+), 39 deletions(-) diff --git a/mysql-test/suite/ndb/r/ndbinfo_plans.result b/mysql-test/suite/ndb/r/ndbinfo_plans.result index 62dde811b81a..89c63878468f 100644 --- a/mysql-test/suite/ndb/r/ndbinfo_plans.result +++ b/mysql-test/suite/ndb/r/ndbinfo_plans.result @@ -50,8 +50,8 @@ ndb$acc_operations 15 64 ndb$blocks 23 20 ndb$columns 445 44 ndb$config_nodes 34 28 -ndb$config_params 152 120 -ndb$config_values 288 24 +ndb$config_params 153 120 +ndb$config_values 290 24 ndb$counters 104 24 ndb$dblqh_tcconnect_state 25 52 ndb$dbtc_apiconnect_state 25 52 @@ -60,7 +60,7 @@ ndb$dict_obj_types 20 20 ndb$disk_write_speed_aggregate 8 120 ndb$disk_write_speed_base 488 48 ndb$diskpagebuffer 10 64 -ndb$error_messages 768 52 +ndb$error_messages 769 52 ndb$frag_locks 344 96 ndb$frag_mem_use 344 100 ndb$frag_operations 344 192 diff --git 
a/storage/ndb/include/mgmapi/mgmapi_config_parameters.h b/storage/ndb/include/mgmapi/mgmapi_config_parameters.h index bd28a7261213..a9520be64c5a 100644 --- a/storage/ndb/include/mgmapi/mgmapi_config_parameters.h +++ b/storage/ndb/include/mgmapi/mgmapi_config_parameters.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2004, 2021, Oracle and/or its affiliates. + Copyright (c) 2004, 2025, Oracle and/or its affiliates. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -240,6 +240,8 @@ #define CFG_DB_WATCHDOG_IMMEDIATE_KILL 657 #define CFG_DB_ENABLE_REDO_CONTROL 658 +#define CFG_DB_API_FAILURE_HANDLING_TIMEOUT 682 + #define CFG_NODE_ARBIT_RANK 200 #define CFG_NODE_ARBIT_DELAY 201 #define CFG_EXTRA_SEND_BUFFER_MEMORY 203 diff --git a/storage/ndb/include/mgmapi/ndbd_exit_codes.h b/storage/ndb/include/mgmapi/ndbd_exit_codes.h index cc734171118c..bbe403295939 100644 --- a/storage/ndb/include/mgmapi/ndbd_exit_codes.h +++ b/storage/ndb/include/mgmapi/ndbd_exit_codes.h @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2021, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -140,6 +140,8 @@ typedef ndbd_exit_classification_enum ndbd_exit_classification; #define NDBD_EXIT_SR_OUT_OF_DATAMEMORY 6800 /* LQH 7200-> */ #define NDBD_EXIT_LCP_SCAN_WATCHDOG_FAIL 7200 +/* QMGR 7400-> */ +#define NDBD_EXIT_API_FAIL_HANDLING_TIMEOUT 7400 /* Errorcodes for NDB filesystem */ #define NDBD_EXIT_AFS_NOPATH 2801 diff --git a/storage/ndb/src/kernel/blocks/ERROR_codes.txt b/storage/ndb/src/kernel/blocks/ERROR_codes.txt index 0265e7eb6ea2..f20460c1e119 100644 --- a/storage/ndb/src/kernel/blocks/ERROR_codes.txt +++ b/storage/ndb/src/kernel/blocks/ERROR_codes.txt @@ -21,13 +21,13 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -Next QMGR 950 +Next QMGR 962 Next NDBCNTR 1030 Next NDBFS 2003 Next DBACC 3007 Next DBTUP 4040 Next DBLQH 5113 -Next DBDICT 6227 +Next DBDICT 6228 Next DBDIH 7251 Next DBTC 8127 Next TRPMAN 9007 diff --git a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp index 2932f3b77b9e..919aa3ecdb29 100644 --- a/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp +++ b/storage/ndb/src/kernel/blocks/dbdict/Dbdict.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2024, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -5396,6 +5396,15 @@ void Dbdict::execAPI_FAILREQ(Signal* signal) Uint32 failedApiNode = signal->theData[0]; BlockReference retRef = signal->theData[1]; + if (ERROR_INSERTED(6227)) { + jam(); + g_eventLogger->info("Delaying failure handling of node %u for 5 seconds", + failedApiNode); + sendSignalWithDelay(reference(), GSN_API_FAILREQ, signal, 5000, + signal->getLength()); + return; + } + ndbrequire(retRef == QMGR_REF); // As callback hard-codes QMGR_REF #if 0 Uint32 userNode = refToNode(c_connRecord.userBlockRef); diff --git a/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp b/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp index 262faefe3722..fd45781e9800 100644 --- a/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp +++ b/storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2021, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -541,6 +541,7 @@ class Qmgr : public SimulatedBlock { Uint32 c_restartFailureTimeout; Uint32 c_restartNoNodegroupTimeout; NDB_TICKS c_start_election_time; + Uint32 c_apiFailureTimeoutSecs; Uint16 creadyDistCom; diff --git a/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp b/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp index 431d1584bf4e..bc76b5d33006 100644 --- a/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp +++ b/storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp @@ -2719,6 +2719,7 @@ void Qmgr::initData(Signal* signal) c_restartPartitionedTimeout = Uint32(~0); c_restartFailureTimeout = Uint32(~0); c_restartNoNodegroupTimeout = 15000; + c_apiFailureTimeoutSecs = 600; ndb_mgm_get_int_parameter(p, CFG_DB_HEARTBEAT_INTERVAL, &hbDBDB); ndb_mgm_get_int_parameter(p, CFG_DB_ARBIT_TIMEOUT, &arbitTimeout); ndb_mgm_get_int_parameter(p, CFG_DB_ARBIT_METHOD, &arbitMethod); @@ -2732,6 +2733,8 @@ void Qmgr::initData(Signal* signal) &c_restartFailureTimeout); ndb_mgm_get_int_parameter(p, CFG_DB_CONNECT_CHECK_DELAY, &ccInterval); + ndb_mgm_get_int_parameter(p, CFG_DB_API_FAILURE_HANDLING_TIMEOUT, + &c_apiFailureTimeoutSecs); if(c_restartPartialTimeout == 0) { @@ -3144,18 +3147,18 @@ void Qmgr::checkStartInterface(Signal* signal, NDB_TICKS now) else { jam(); - if(((get_hb_count(nodePtr.i) + 1) % 30) == 0) - { - jam(); - char buf[256]; - if (getNodeInfo(nodePtr.i).m_type == NodeInfo::DB) - { + const Uint32 secondsElapsed = get_hb_count(nodePtr.i); + bool generateDelayLog = + (secondsElapsed && ((secondsElapsed % 30) == 0)); + + if (getNodeInfo(nodePtr.i).m_type == NodeInfo::DB) { + if (generateDelayLog) { jam(); + char buf[256]; BaseString::snprintf(buf, sizeof(buf), "Failure handling of node %d has not completed" " in %d seconds - state = %d", - nodePtr.i, - get_hb_count(nodePtr.i), + nodePtr.i, secondsElapsed, nodePtr.p->failState); warningEvent("%s", buf); @@ -3166,14 +3169,42 @@ void Qmgr::checkStartInterface(Signal* signal, NDB_TICKS now) signal->theData[1] = nodePtr.i; sendSignal(DBDIH_REF, GSN_DUMP_STATE_ORD, signal, 2, JBB); } - else - { + } + else + { + /* API/MGMD */ + + /* Check which timeout value to use */ + Uint32 maxSeconds = c_apiFailureTimeoutSecs; + if (nodePtr.p->failState == WAITING_FOR_API_FAILCONF) { + /* Check if we are waiting for DICT */ + for (Uint32 i = 0; i < NDB_ARRAY_SIZE(nodePtr.p->m_failconf_blocks); + i++) { + if 
(nodePtr.p->m_failconf_blocks[i] == DBDICT) { + /* DICT failure handling time can include + * Schema Transaction rollback/forward + */ + maxSeconds = (7 * 24 * 60 * 60); + break; + } + } + } + const Uint32 remainSecs = + ((maxSeconds > 0) ? (secondsElapsed >= maxSeconds + ? 0 + : maxSeconds - secondsElapsed) + : UINT32_MAX); + + const bool escalate = (remainSecs == 0); + generateDelayLog |= (remainSecs == 5 || escalate); + + if (generateDelayLog) { jam(); + char buf[256]; BaseString::snprintf(buf, sizeof(buf), "Failure handling of api %u has not completed" - " in %d seconds - state = %d", - nodePtr.i, - get_hb_count(nodePtr.i), + " in %d seconds. Limit %u - state = %d", + nodePtr.i, secondsElapsed, maxSeconds, nodePtr.p->failState); warningEvent("%s", buf); if (nodePtr.p->failState == WAITING_FOR_API_FAILCONF) @@ -3202,6 +3233,27 @@ void Qmgr::checkStartInterface(Signal* signal, NDB_TICKS now) } } } + if (escalate) + { + g_eventLogger->error( + "Failure handling of api %u has not completed " + "in %d seconds. Limit %d - state = %d blocks " + "%u %u %u %u %u", + nodePtr.i, secondsElapsed, maxSeconds, nodePtr.p->failState, + nodePtr.p->m_failconf_blocks[0], + nodePtr.p->m_failconf_blocks[1], + nodePtr.p->m_failconf_blocks[2], + nodePtr.p->m_failconf_blocks[3], + nodePtr.p->m_failconf_blocks[4]); + + CRASH_INSERTION(961); // Safe exit for testing + char buf[100]; + BaseString::snprintf( + buf, sizeof(buf), + "Exceeded limit of %u seconds handling failure of Api node %u.", + maxSeconds, nodePtr.i); + progError(__LINE__, NDBD_EXIT_API_FAIL_HANDLING_TIMEOUT, buf); + } } } } @@ -6722,6 +6774,15 @@ Qmgr::execDUMP_STATE_ORD(Signal* signal) sendSignal(TRPMAN_REF, GSN_CLOSE_COMREQ, signal, CloseComReqConf::SignalLength, JBB); } + if (signal->theData[0] == 909) { + jam(); + if (signal->getLength() == 2) { + jam(); + g_eventLogger->info("QMGR : Setting c_apiFailureTimeoutSecs to %u", + signal->theData[1]); + c_apiFailureTimeoutSecs = signal->theData[1]; + } + } }//Qmgr::execDUMP_STATE_ORD() void diff --git a/storage/ndb/src/kernel/error/ndbd_exit_codes.c b/storage/ndb/src/kernel/error/ndbd_exit_codes.c index 6726442c90d0..5cc2cfb0f53b 100644 --- a/storage/ndb/src/kernel/error/ndbd_exit_codes.c +++ b/storage/ndb/src/kernel/error/ndbd_exit_codes.c @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2021, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -160,6 +160,11 @@ static const ErrStruct errArray[] = {NDBD_EXIT_LCP_SCAN_WATCHDOG_FAIL, XIE, "LCP fragment scan watchdog detected a problem. Please report a bug."}, + /* QMGR */ + {NDBD_EXIT_API_FAIL_HANDLING_TIMEOUT, XIE, + "Timeout handling Api failure. Please check ApiFailureHandlingTimeout " + "config or report a bug."}, + /* Ndbfs error messages */ /* Most codes will have additional info, such as OS error code */ {NDBD_EXIT_AFS_NOPATH, XIE, "No file system path"}, diff --git a/storage/ndb/src/kernel/vm/NdbinfoTables.cpp b/storage/ndb/src/kernel/vm/NdbinfoTables.cpp index bb812b2938ad..cb988b83b98d 100644 --- a/storage/ndb/src/kernel/vm/NdbinfoTables.cpp +++ b/storage/ndb/src/kernel/vm/NdbinfoTables.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2009, 2021, Oracle and/or its affiliates. + Copyright (c) 2009, 2025, Oracle and/or its affiliates. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -624,7 +624,7 @@ DECLARE_NDBINFO_TABLE(TC_TIME_TRACK_STATS, 15) = }; Uint32 CONFIG_VALUES_fn(const Ndbinfo::Counts &c) { - return c.data_nodes * 144; // 144 = current number of config parameters + return c.data_nodes * 145; // 145 = current number of config parameters }; DECLARE_NDBINFO_TABLE(CONFIG_VALUES,12) = { { "config_values", 3, 0, CONFIG_VALUES_fn, "Configuration parameter values" }, diff --git a/storage/ndb/src/mgmsrv/ConfigInfo.cpp b/storage/ndb/src/mgmsrv/ConfigInfo.cpp index 60d155ec6c1c..91bcf78c5d14 100644 --- a/storage/ndb/src/mgmsrv/ConfigInfo.cpp +++ b/storage/ndb/src/mgmsrv/ConfigInfo.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2021, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -2436,6 +2436,13 @@ const ConfigInfo::ParamInfo ConfigInfo::m_ParamInfo[] = { "512" }, + {CFG_DB_API_FAILURE_HANDLING_TIMEOUT, "ApiFailureHandlingTimeout", DB_TOKEN, + "Maximum allowed duration of Api failure handling before escalating " + "handling. 0 implies no time limit, minimum usable value is 10.", + ConfigInfo::CI_USED, false, ConfigInfo::CI_INT, + "600", // 10 minutes + "0", STR_VALUE(MAX_INT_RNIL)}, + /*************************************************************************** * API ***************************************************************************/ diff --git a/storage/ndb/test/ndbapi/testNodeRestart.cpp b/storage/ndb/test/ndbapi/testNodeRestart.cpp index 347f7b3ddd64..5f0bfcdc6bce 100644 --- a/storage/ndb/test/ndbapi/testNodeRestart.cpp +++ b/storage/ndb/test/ndbapi/testNodeRestart.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2024, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -11025,6 +11025,334 @@ int runRestartsWithSlowCommitComplete(NDBT_Context *ctx, NDBT_Step *step) { } +static const Uint32 MAX_EXTRA_CONNECTIONS = MAX_NODES; +static Uint32 g_numExtraConnections = 0; +static Ndb_cluster_connection *g_extraConnections[MAX_EXTRA_CONNECTIONS]; + +int runSetupExtraConnections(NDBT_Context *ctx, NDBT_Step *step) { + const Uint32 extraConnections = + ctx->getProperty("ExtraConnections", Uint32(0)); + assert(g_numExtraConnections == 0); + if (extraConnections > MAX_EXTRA_CONNECTIONS) { + g_err << "Too many extra connections requested " << extraConnections + << endl; + return NDBT_FAILED; + } + + g_err << "Setting up " << extraConnections << " extra connections." << endl; + + for (Uint32 c = 0; c < extraConnections; c++) { + Ndb_cluster_connection *ncc = new Ndb_cluster_connection(); + if (ncc->connect() != 0) { + g_err << "ERROR : connect failure." 
<< endl; + return NDBT_FAILED; + } + g_err << "Connection " << c << " node id " << ncc->node_id() << endl; + + g_extraConnections[c] = ncc; + g_numExtraConnections++; + } + + return NDBT_OK; +} + +int applyDumpCodes(const char *codeGroupsString) { + NdbRestarter restarter; + + /* Format is Code 1 [Code 2]*[, Code 1 [Code 2]*]* */ + + g_err << "Applying dump codes " << codeGroupsString << endl; + Vector codeGroups; + { + BaseString list(codeGroupsString); + list.split(codeGroups, ","); + } + + for (Uint32 g = 0; g < codeGroups.size(); g++) { + Vector codes; + codeGroups[g].split(codes, " "); + const int maxCodes = 25; + int codeNums[maxCodes]; + const int numCodes = codes.size(); + + if (numCodes > maxCodes) { + g_err << "Too many codes " << numCodes << endl; + return NDBT_FAILED; + } + + for (int c = 0; c < numCodes; c++) { + codeNums[c] = atoi(codes[c].c_str()); + } + + g_err << " Injecting code group " << codeGroups[g].c_str() + << " in all nodes " << endl; + + if (restarter.dumpStateAllNodes(codeNums, numCodes) != NDBT_OK) { + g_err << "Failed to dump codeGroup " << codeGroups[g].c_str() << endl; + return NDBT_FAILED; + } + } + + return NDBT_OK; +} + +int runClearExtraConnections(NDBT_Context *ctx, NDBT_Step *step) { + g_err << "Clearing away " << g_numExtraConnections << " extra connections" + << endl; + for (Uint32 c = 0; c < g_numExtraConnections; c++) { + Ndb_cluster_connection *ncc = g_extraConnections[c]; + delete ncc; + + g_extraConnections[c] = NULL; + } + + g_numExtraConnections = 0; + return NDBT_OK; +} + +int runDumpSetup(NDBT_Context *ctx, NDBT_Step *step) { + const char *dumpSetupList = ctx->getProperty("DumpSetup", ""); + if (strcmp(dumpSetupList, "") == 0) { + return NDBT_OK; + } + + return applyDumpCodes(dumpSetupList); +} + +int runDumpClear(NDBT_Context *ctx, NDBT_Step *step) { + const char *dumpClearList = ctx->getProperty("DumpClear", ""); + if (strcmp(dumpClearList, "") == 0) { + return NDBT_OK; + } + + return applyDumpCodes(dumpClearList); +} + +int runSetupErrorInjections(NDBT_Context *ctx, NDBT_Step *step) { + const char *errorList = ctx->getProperty("ErrorInjections", ""); + if (strcmp(errorList, "") == 0) { + return NDBT_OK; + } + NdbRestarter restarter; + + Uint32 errorInjectionNode = ctx->getProperty("ErrorInjectionNode", Uint32(0)); + g_err << "Error list : " << errorList << endl; + if (errorInjectionNode) { + /* 0 == ALL + * 1..n == specific node + * ~Uint32(0) == choose one + */ + if (errorInjectionNode == ~Uint32(0)) { + errorInjectionNode = restarter.getNode(NdbRestarter::NS_RANDOM); + } + g_err << "Error node : " << errorInjectionNode << endl; + } + + Vector codes; + { + BaseString list(errorList); + list.split(codes, ","); + } + + for (Uint32 i = 0; i < codes.size(); i++) { + const int code = atoi(codes[i].c_str()); + if (errorInjectionNode) { + g_err << " Injecting code " << code << " in node " << errorInjectionNode + << endl; + if (restarter.insertErrorInNode(errorInjectionNode, code) != NDBT_OK) { + g_err << "Failed to inject error " << code << " in node " + << errorInjectionNode << endl; + return NDBT_FAILED; + } + } else { + g_err << " Injecting code " << code << " in all nodes" << endl; + if (restarter.insertErrorInAllNodes(code) != NDBT_OK) { + g_err << "Failed to inject error " << code << endl; + return NDBT_FAILED; + } + } + } + + return NDBT_OK; +} + +int runClearErrorInjections(NDBT_Context *ctx, NDBT_Step *step) { + NdbRestarter restarter; + + restarter.insertErrorInAllNodes(0); + return NDBT_OK; +} + +int 
runFailExtraConnections(NDBT_Context *ctx, NDBT_Step *step) { + const Uint32 connectFailIterations = + ctx->getProperty("ConnectionFailIterations", Uint32(10)); + const Uint32 connectFailDelaySecs = + ctx->getProperty("ConnectionFailDelaySecs", Uint32(10)); + const bool connectFailWaitReconnect = + (ctx->getProperty("ConnectionFailWaitReconnect", Uint32(0)) != 0); + + g_err << "runFailExtraConnections : Extra connections " + << g_numExtraConnections << " iterations " << connectFailIterations + << " delay secs " << connectFailDelaySecs << " wait reconnect " + << connectFailWaitReconnect << endl; + + if (g_numExtraConnections == 0) { + g_err << "No extra connections - nothing to do" << endl; + ctx->stopTest(); + return NDBT_OK; + } + + NdbRestarter restarter; + + /* Get list of data nodes */ + const int numDataNodes = restarter.getNumDbNodes(); + int dataNodes[MAX_NDB_NODES]; + for (int i = 0; i < numDataNodes; i++) { + dataNodes[i] = restarter.getDbNodeId(i); + } + + Ndb_cluster_connection *failSet[MAX_EXTRA_CONNECTIONS]; + for (Uint32 c = 0; c < g_numExtraConnections; c++) { + failSet[c] = g_extraConnections[c]; + } + + int dumpCodes[] = {900, 0}; + + NdbSleep_SecSleep(connectFailDelaySecs); + + for (Uint32 i = 0; i < connectFailIterations; i++) { + if (ctx->isTestStopped()) { + g_err << "Test stopped by another step" << endl; + break; + } + + /* Rotate set of apis left */ + Ndb_cluster_connection *prev = failSet[0]; + for (Uint32 c = g_numExtraConnections; c > 0; c--) { + Ndb_cluster_connection *curr = failSet[c - 1]; + failSet[c - 1] = prev; + prev = curr; + } + + const Uint32 concurrentFailures = 1 + (i % g_numExtraConnections); + + for (Uint32 f = 0; f < concurrentFailures; f++) { + const int nodeId = failSet[f]->node_id(); + g_err << "Failing node " << f + 1 << "/" << concurrentFailures + << " nodeid : " << nodeId << endl; + dumpCodes[1] = nodeId; + /* Todo : Consider dumping in just one data node, allowing to propagate */ + restarter.dumpStateAllNodes(dumpCodes, 2); + } + + NdbSleep_SecSleep(connectFailDelaySecs); + + if (connectFailWaitReconnect) { + for (Uint32 f = 0; f < concurrentFailures; f++) { + const int nodeId = failSet[f]->node_id(); + + g_err << "Waiting for Api node id " << nodeId + << " to report connected to " << numDataNodes << " data nodes." + << endl; + if (failSet[f]->wait_until_ready(dataNodes, numDataNodes, 120) != + numDataNodes) { + g_err << "Timed out waiting for api node " << nodeId << " to connect." + << endl; + ctx->stopTest(); + return NDBT_FAILED; + } + g_err << " Api node id " << nodeId << " now connected to " + << numDataNodes << " data nodes" << endl; + } + } + } + + g_err << "All Api failures generated, stopping test" << endl; + ctx->stopTest(); + + return NDBT_OK; +} + +int runMixedLoadExtra(NDBT_Context *ctx, NDBT_Step *step) { + /* One thread running transactions towards a cluster + * across the main and extra cluster connections. 
+ * Cluster connections may fail and recover during + * execution + */ + const Uint32 totalSteps = step->getStepTypeCount(); + const Uint32 stepNo = step->getStepTypeNo(); + + int api_node_id = 0; + Ndb_cluster_connection *ncc = NULL; + { + Ndb *pMainNdb = GETNDB(step); + Uint32 connectionNo = stepNo % (1 + g_numExtraConnections); + + if (connectionNo == 0) { + ncc = &pMainNdb->get_ndb_cluster_connection(); + } else { + ncc = g_extraConnections[connectionNo - 1]; + } + + api_node_id = ncc->node_id(); + + g_err << "runMixedLoadExtra step " << stepNo << "/" << totalSteps + << " using connection " << connectionNo << " (api id " << api_node_id + << ")" << endl; + } + + /* Setup an Ndb object using the connection */ + Ndb *pNdb = new Ndb(ncc, "TEST_DB"); + if (pNdb->init() != 0) { + g_err << "Error initialising Ndb connection : " << pNdb->getNdbError() + << endl; + delete pNdb; + return NDBT_FAILED; + } + + if (pNdb->waitUntilReady(30) != 0) { + g_err << "Error waiting until ready in step " << stepNo << endl; + delete pNdb; + return NDBT_FAILED; + } + + HugoTransactions hugoTrans( + *ctx->getTab()); /* Cheat using main connection dict */ + /* Have hugoTrans instances across step threads avoid each other */ + hugoTrans.setThrInfo(totalSteps, stepNo); + int records = ctx->getNumRecords(); + int batch = 10; + + Uint32 loopCount = 0; + while (!ctx->isTestStopped()) { + int ret = hugoTrans.pkUpdateRecords(pNdb, records, batch); + if (ret != 0) { + g_err << "Step " << stepNo << " running updates as api node " + << api_node_id << " got hugoTrans error " << hugoTrans.getNdbError() + << endl; + g_err << "Ignoring." << endl; + } + + ret = hugoTrans.scanReadRecords(pNdb, records, 10, /* abortPct */ + 0, NdbOperation::LM_CommittedRead); + if (ret != 0) { + g_err << "Step " << stepNo << " running scan as api node " << api_node_id + << " got hugoTrans error " << hugoTrans.getNdbError() << endl; + g_err << "Ignoring." << endl; + } + + if ((++loopCount % 10) == 0) { + g_err << "Step " << stepNo << " on api node " << api_node_id + << " completed " << loopCount << " iterations." 
<< endl; + } + } + + delete pNdb; + + return NDBT_OK; +} + + NDBT_TESTSUITE(testNodeRestart); TESTCASE("NoLoad", "Test that one node at a time can be stopped and then restarted "\ @@ -11926,6 +12254,51 @@ TESTCASE("TransientStatesNF", STEP(runRestartsWithSlowCommitComplete); FINALIZER(runClearTable); } +TESTCASE("multi_apifail", "Multiple concurrent api failures") { + /* Multiple extra API connections + * Multiple threads using main connection + extra connections + * (Sub)sets of extra Api connections disconnected + * Gives coverage of API failure + reconnect handling + */ + INITIALIZER(runLoadTable); + TC_PROPERTY("ExtraConnections", 3); + TC_PROPERTY("ConnectionFailIterations", 10); + TC_PROPERTY("ConnectionFailDelaySecs", 10); + INITIALIZER(runSetupExtraConnections); + STEPS(runMixedLoadExtra, 10); + STEP(runFailExtraConnections); + FINALIZER(runClearExtraConnections); + FINALIZER(runClearTable); +} +TESTCASE("timeout_apifail", "Timeout handling api failure") { + /* Single extra API connection + * Multiple threads using main connection + extra connection + * Reduced QMGR API failure handling timeout to reduce test runtime + * TC failure handling stalled + * API is disconnected, TC API failure handling stalls + * Gives coverage of API failure handling timeout escalation + */ + INITIALIZER(runLoadTable); + TC_PROPERTY("ExtraConnections", 1); + TC_PROPERTY("ConnectionFailIterations", 1); + TC_PROPERTY("ConnectionFailDelaySecs", 10); + TC_PROPERTY("ConnectionFailWaitReconnect", + 1); // Wait for connection to fully recover + TC_PROPERTY("DumpSetup", "909 20"); // Reduce Api Failure timeout + TC_PROPERTY("DumpClear", "909 600"); // Restore to default + TC_PROPERTY("ErrorInjections", + "961, 8125"); // Stall failure handling, soft crash + TC_PROPERTY("ErrorInjectionNode", ~Uint32(0)); // Choose one + INITIALIZER(runSetupExtraConnections); + INITIALIZER(runDumpSetup); + INITIALIZER(runSetupErrorInjections); + STEPS(runMixedLoadExtra, 10); + STEP(runFailExtraConnections); + FINALIZER(runDumpClear); + FINALIZER(runClearErrorInjections); + FINALIZER(runClearExtraConnections); + FINALIZER(runClearTable); +} NDBT_TESTSUITE_END(testNodeRestart) int main(int argc, const char** argv){ diff --git a/storage/ndb/test/run-test/conf-autotest.cnf b/storage/ndb/test/run-test/conf-autotest.cnf index 33af4e9aab31..8c1e879fed97 100644 --- a/storage/ndb/test/run-test/conf-autotest.cnf +++ b/storage/ndb/test/run-test/conf-autotest.cnf @@ -1,4 +1,4 @@ -# Copyright (c) 2015, 2024, Oracle and/or its affiliates. +# Copyright (c) 2015, 2025, Oracle and/or its affiliates. 
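(For orientation, a hedged sketch that is not part of the patch: applyDumpCodes() above splits a DumpSetup/DumpClear property string on "," into code groups and each group on " " into words, then sends each group as one DUMP request to all data nodes. For the timeout_apifail settings this amounts to roughly the following; the variable names are illustrative only.)

    NdbRestarter restarter;
    // DumpSetup "909 20": one group of two words, reducing the QMGR
    // Api failure handling timeout for the duration of the test.
    int setupCodes[] = {909, 20};
    restarter.dumpStateAllNodes(setupCodes, 2);
    // DumpClear "909 600": restore the default timeout afterwards.
    int clearCodes[] = {909, 600};
    restarter.dumpStateAllNodes(clearCodes, 2);

(Similarly, with ExtraConnections=3 and STEPS(runMixedLoadExtra, 10), the stepNo % (1 + g_numExtraConnections) mapping in runMixedLoadExtra() keeps roughly a quarter of the step threads on the main connection and spreads the rest across the three extra connections, so load continues on every connection while subsets of the extra connections are failed.)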
# # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -33,7 +33,7 @@ protocol=tcp [cluster_config.2ndbd] ndb_mgmd = CHOOSE_host1 ndbd = CHOOSE_host2,CHOOSE_host3 -ndbapi= CHOOSE_host1,, +ndbapi= CHOOSE_host1,,,, NoOfReplicas = 2 DataMemory = 400M @@ -61,7 +61,7 @@ TimeBetweenWatchDogCheckInitial=60000 [cluster_config.2node] ndb_mgmd = CHOOSE_host1 ndbd = CHOOSE_host2,CHOOSE_host3 -ndbapi= CHOOSE_host1,, +ndbapi= CHOOSE_host1,,,, NoOfReplicas = 2 DataMemory = 400M @@ -89,7 +89,7 @@ TimeBetweenWatchDogCheckInitial=60000 [cluster_config.2node8thr] ndb_mgmd = CHOOSE_host1 ndbd = CHOOSE_host2,CHOOSE_host3 -ndbapi= CHOOSE_host1,, +ndbapi= CHOOSE_host1,,,, NoOfReplicas = 2 IndexMemory = 100M @@ -118,7 +118,7 @@ TimeBetweenWatchDogCheckInitial=60000 [cluster_config.2node10thr] ndb_mgmd = CHOOSE_host1 ndbd = CHOOSE_host2,CHOOSE_host3 -ndbapi= CHOOSE_host1,, +ndbapi= CHOOSE_host1,,,, NoOfReplicas = 2 DataMemory = 400M @@ -147,7 +147,7 @@ TimeBetweenWatchDogCheckInitial=60000 [cluster_config.4node] ndb_mgmd = CHOOSE_host1 ndbd = CHOOSE_host2,CHOOSE_host3,CHOOSE_host4,CHOOSE_host5 -ndbapi= CHOOSE_host1,, +ndbapi= CHOOSE_host1,,,, NoOfReplicas = 2 IndexMemory = 100M @@ -203,7 +203,7 @@ InitialTablespace = datafile01.dat:256M;datafile02.dat:256M [cluster_config.3node3rpl] ndb_mgmd = CHOOSE_host1 ndbd = CHOOSE_host2,CHOOSE_host3,CHOOSE_host1 -ndbapi= CHOOSE_host1,, +ndbapi= CHOOSE_host1,,,, NoOfReplicas = 3 IndexMemory = 100M @@ -230,7 +230,7 @@ TimeBetweenWatchDogCheckInitial=60000 [cluster_config.4node4rpl] ndb_mgmd = CHOOSE_host1 ndbd = CHOOSE_host2,CHOOSE_host3,CHOOSE_host4,CHOOSE_host5 -ndbapi= CHOOSE_host1,, +ndbapi= CHOOSE_host1,,,, NoOfReplicas = 4 IndexMemory = 100M @@ -260,7 +260,7 @@ TimeBetweenWatchDogCheckInitial=60000 [cluster_config.2node2mgm] ndb_mgmd = CHOOSE_host1,CHOOSE_host6 ndbd = CHOOSE_host2,CHOOSE_host3 -ndbapi= CHOOSE_host1,, +ndbapi= CHOOSE_host1,,,, NoOfReplicas = 2 IndexMemory = 50M @@ -279,7 +279,7 @@ Checksum=1 [cluster_config.2node8thr2mgm] ndb_mgmd = CHOOSE_host1,CHOOSE_host6 ndbd = CHOOSE_host2,CHOOSE_host3 -ndbapi= CHOOSE_host1,, +ndbapi= CHOOSE_host1,,,, NoOfReplicas = 2 IndexMemory = 50M @@ -299,7 +299,7 @@ Checksum=1 [cluster_config.4node2mgm] ndb_mgmd = CHOOSE_host1,CHOOSE_host8 ndbd = CHOOSE_host2,CHOOSE_host3,CHOOSE_host4,CHOOSE_host5 -ndbapi= CHOOSE_host1,, +ndbapi= CHOOSE_host1,,,, NoOfReplicas = 2 IndexMemory = 50M diff --git a/storage/ndb/test/run-test/daily-devel--07-tests.txt b/storage/ndb/test/run-test/daily-devel--07-tests.txt index 46fb9ddeca98..62c28cf5bcc4 100644 --- a/storage/ndb/test/run-test/daily-devel--07-tests.txt +++ b/storage/ndb/test/run-test/daily-devel--07-tests.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2015, 2024, Oracle and/or its affiliates. +# Copyright (c) 2015, 2025, Oracle and/or its affiliates. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -320,3 +320,12 @@ max-time: 180 cmd: testNdbinfo args: -n ScanFragMemUseDuringCreateDropTable -l 10000 T1 max-time: 180 + +cmd: testNodeRestart +args: -n multi_apifail T1 +max-time: 360 + +cmd: testNodeRestart +args: -n timeout_apifail T1 +max-time: 240 + From 4c61d4ec638f4a112557eb03ac291ef6f3641b23 Mon Sep 17 00:00:00 2001 From: Frazer Clement Date: Fri, 7 Mar 2025 18:33:13 +0000 Subject: [PATCH 11/13] Bug#37518267 Improve data node thread watchdog shutdown handling Backport to 7.6 Two changes : 1. 
Have node error handling set thread watchdog state prior to attempting to serialise or log error details to files. This helps users understand whether Watchdog logs indicate a detected overload, or whether they indicate a delay in shutting down a data node. 2. Have the Watchdog thread treat 'slow logging' as a special case. If a registered thread exceeds its time allowance in a shutdown logging state then the watchdog directly calls NdbShutdown(), which is more likely to lead to an immediate process exit. This improves the system's ability to force a timely process failure (and subsequent restart) potentially at the expense of some logging. Test coverage by testNodeRestart -n WatchdogSlowShutdown is enhanced to cover another case. Error injection coverage of data node shutdown is refactored to enable future extensions. Change-Id: I57eabbdb04423409d0aae1b6e548013a7088f4d0 --- storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp | 26 +++--- .../ndb/src/kernel/error/ErrorReporter.cpp | 34 ++++++-- storage/ndb/src/kernel/ndbd.cpp | 29 +++++-- storage/ndb/src/kernel/vm/Configuration.cpp | 17 +++- storage/ndb/src/kernel/vm/Configuration.hpp | 21 ++++- storage/ndb/src/kernel/vm/SimulatedBlock.cpp | 4 +- storage/ndb/src/kernel/vm/WatchDog.cpp | 12 ++- storage/ndb/src/kernel/vm/mt.cpp | 8 ++ storage/ndb/test/ndbapi/testNodeRestart.cpp | 85 ++++++++++--------- 9 files changed, 166 insertions(+), 70 deletions(-) diff --git a/storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp b/storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp index aa4cd2f49788..443c72760664 100644 --- a/storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp +++ b/storage/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2024, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -74,11 +74,6 @@ // Used here only to print event reports on stdout/console. 
extern EventLogger * g_eventLogger; -extern int simulate_error_during_shutdown; - -#ifdef ERROR_INSERT -extern int simulate_error_during_error_reporting; -#endif // Index pages used by ACC instances Uint32 g_acc_pages_used[1 + MAX_NDBMT_LQH_WORKERS]; @@ -219,17 +214,20 @@ void Cmvmi::execNDB_TAMPER(Signal* signal) ndbrequire(false); } +#ifdef ERROR_INSERT #ifndef NDB_WIN32 if(ERROR_INSERTED(9996)){ - simulate_error_during_shutdown= SIGSEGV; + globalEmulatorData.theConfiguration->setShutdownHandlingFault( + Configuration::SHF_UNIX_SIGNAL, SIGSEGV); ndbrequire(false); } if(ERROR_INSERTED(9995)){ - simulate_error_during_shutdown= SIGSEGV; + globalEmulatorData.theConfiguration->setShutdownHandlingFault( + Configuration::SHF_UNIX_SIGNAL, SIGSEGV); kill(getpid(), SIGABRT); } - +#endif #endif } // execNDB_TAMPER() @@ -1982,13 +1980,17 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal) if (arg == DumpStateOrd::CmvmiSetErrorHandlingError) { Uint32 val = 0; + Uint32 extra = 0; if (signal->length() >= 2) { val = signal->theData[1]; + if (signal->length() >= 3) { + extra = signal->theData[2]; + } } - g_eventLogger->info("Cmvmi : Setting ErrorHandlingError to %u", - val); - simulate_error_during_error_reporting = val; + g_eventLogger->info("Cmvmi : Setting ShutdownErrorHandling to %u %u", val, + extra); + globalEmulatorData.theConfiguration->setShutdownHandlingFault(val, extra); } #endif diff --git a/storage/ndb/src/kernel/error/ErrorReporter.cpp b/storage/ndb/src/kernel/error/ErrorReporter.cpp index bdc23aa0234e..7d0ddec5332f 100644 --- a/storage/ndb/src/kernel/error/ErrorReporter.cpp +++ b/storage/ndb/src/kernel/error/ErrorReporter.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2022, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. 
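(A minimal sketch, not part of the patch, of how a test could drive the extended CmvmiSetErrorHandlingError DUMP handling above: the second word selects the scenario and an optional third word supplies the extra value. The 60 second figure is only an example; with no third word the extra stays 0, which the ErrorReporter change below treats as a 300 second stall.)

    NdbRestarter restarter;
    // Request SHF_DELAY_WHILE_WRITING_ERRORLOG (value 2 in the new
    // Configuration enum) with an explicit 60 second stall.
    const int dumpVals[] = {DumpStateOrd::CmvmiSetErrorHandlingError, 2, 60};
    restarter.dumpStateAllNodes(dumpVals, 3);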
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -65,10 +65,6 @@ static void dumpJam(FILE* jamStream, const char * ndb_basename(const char *path); -#ifdef ERROR_INSERT -int simulate_error_during_error_reporting = 0; -#endif - static const char* formatTimeStampString(char* theDateTimeString, size_t len){ @@ -435,13 +431,33 @@ WriteMessage(int thrdMessageID, fflush(stream); fclose(stream); +#ifdef ERROR_INSERT + if (globalEmulatorData.theConfiguration->getShutdownHandlingFault() == + Configuration::SHF_DELAY_WHILE_WRITING_ERRORLOG) { + Uint32 seconds = + globalEmulatorData.theConfiguration->getShutdownHandlingFaultExtra(); + if (seconds == 0) seconds = 300; + + fprintf(stderr, + "Stall for %us during error reporting before releasing lock\n", + seconds); + NdbSleep_SecSleep(seconds); + fprintf(stderr, "Stall finished\n"); + } +#endif + ErrorReporter::prepare_to_crash(false, (nst == NST_ErrorInsert)); #ifdef ERROR_INSERT - if (simulate_error_during_error_reporting == 1) - { - fprintf(stderr, "Stall during error reporting after releasing lock\n"); - NdbSleep_MilliSleep(30000); + if (globalEmulatorData.theConfiguration->getShutdownHandlingFault() == + Configuration::SHF_DELAY_AFTER_WRITING_ERRORLOG) { + Uint32 seconds = + globalEmulatorData.theConfiguration->getShutdownHandlingFaultExtra(); + if (seconds == 0) seconds = 300; + fprintf(stderr, + "Stall for %us during error reporting after releasing lock\n", + seconds); + NdbSleep_SecSleep(seconds); } #endif diff --git a/storage/ndb/src/kernel/ndbd.cpp b/storage/ndb/src/kernel/ndbd.cpp index 242df6ebf6db..c3084d71ae12 100644 --- a/storage/ndb/src/kernel/ndbd.cpp +++ b/storage/ndb/src/kernel/ndbd.cpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2009, 2024, Oracle and/or its affiliates. +/* Copyright (c) 2009, 2025, Oracle and/or its affiliates. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -1152,8 +1152,6 @@ extern "C" my_bool opt_core; // instantiated and updated in NdbcntrMain.cpp extern Uint32 g_currentStartPhase; -int simulate_error_during_shutdown= 0; - void NdbShutdown(int error_code, NdbShutdownType type, @@ -1225,6 +1223,19 @@ NdbShutdown(int error_code, * Very serious, don't attempt to free, just die!! 
*/ g_eventLogger->info("Watchdog shutdown completed - %s", exitAbort); +#ifdef ERROR_INSERT + const Uint32 shf = + globalEmulatorData.theConfiguration->getShutdownHandlingFault(); + if (shf != 0) { + if (shf == Configuration::SHF_DELAY_AFTER_WRITING_ERRORLOG || + shf == Configuration::SHF_DELAY_WHILE_WRITING_ERRORLOG) { + g_eventLogger->info( + "ERROR_INSERT : Watchdog choosing restart rather than hard exit " + "for test pass"); + childExit(error_code, NRT_NoStart_Restart, g_currentStartPhase); + } + } +#endif if (opt_core) { childAbort(error_code, -1,g_currentStartPhase); @@ -1235,13 +1246,19 @@ NdbShutdown(int error_code, } } +#ifdef ERROR_INSERT #ifndef NDB_WIN32 - if (simulate_error_during_shutdown) - { - kill(getpid(), simulate_error_during_shutdown); + if (globalEmulatorData.theConfiguration->getShutdownHandlingFault() == + Configuration::SHF_UNIX_SIGNAL) { + const Uint32 sigId = + globalEmulatorData.theConfiguration->getShutdownHandlingFaultExtra(); + g_eventLogger->info("ERROR_INSERT : Raising unix signal %u to self", + sigId); + kill(getpid(), sigId); while(true) NdbSleep_MilliSleep(10); } +#endif #endif globalEmulatorData.theWatchDog->doStop(); diff --git a/storage/ndb/src/kernel/vm/Configuration.cpp b/storage/ndb/src/kernel/vm/Configuration.cpp index 779657e0832d..207826a19c7d 100644 --- a/storage/ndb/src/kernel/vm/Configuration.cpp +++ b/storage/ndb/src/kernel/vm/Configuration.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2021, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -384,6 +384,9 @@ Configuration::setupConfiguration(){ ndbout_c("Mixology level set to 0x%x", _mixologyLevel); globalTransporterRegistry.setMixologyLevel(_mixologyLevel); } + + _shutdownHandlingFault = 0; + _shutdownHandlingFaultExtra = 0; #endif /** @@ -665,6 +668,18 @@ void Configuration::setMixologyLevel(Uint32 l){ _mixologyLevel = l; } + +Uint32 Configuration::getShutdownHandlingFault() const { + return _shutdownHandlingFault; +}; +Uint32 Configuration::getShutdownHandlingFaultExtra() const { + return _shutdownHandlingFaultExtra; +}; + +void Configuration ::setShutdownHandlingFault(Uint32 v, Uint32 extra) { + _shutdownHandlingFault = v; + _shutdownHandlingFaultExtra = extra; +}; #endif const ndb_mgm_configuration_iterator * diff --git a/storage/ndb/src/kernel/vm/Configuration.hpp b/storage/ndb/src/kernel/vm/Configuration.hpp index f41cbd3be837..cb49af13b789 100644 --- a/storage/ndb/src/kernel/vm/Configuration.hpp +++ b/storage/ndb/src/kernel/vm/Configuration.hpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2021, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -136,6 +136,23 @@ class Configuration { #ifdef ERROR_INSERT Uint32 getMixologyLevel() const; void setMixologyLevel(Uint32); + + enum { + SHF_NONE = 0, + /* Delays during crash handling */ + /* Extra specifies delay in seconds */ + SHF_DELAY_AFTER_WRITING_ERRORLOG = 1, + SHF_DELAY_WHILE_WRITING_ERRORLOG = 2, + + /* Unix signal during crash handling */ + /* Extra specifies signal number */ + SHF_UNIX_SIGNAL = 10 + } ShutdownHandlingFaults; + + Uint32 getShutdownHandlingFault() const; + Uint32 getShutdownHandlingFaultExtra() const; + + void setShutdownHandlingFault(Uint32 v, Uint32 extra = 0); #endif // Cluster configuration @@ -172,6 +189,8 @@ class Configuration { Uint32 _timeBetweenWatchDogCheckInitial; #ifdef ERROR_INSERT Uint32 _mixologyLevel; + Uint32 _shutdownHandlingFault; + Uint32 _shutdownHandlingFaultExtra; #endif Vector threadInfo; diff --git a/storage/ndb/src/kernel/vm/SimulatedBlock.cpp b/storage/ndb/src/kernel/vm/SimulatedBlock.cpp index c081df8b5419..37235a263bde 100644 --- a/storage/ndb/src/kernel/vm/SimulatedBlock.cpp +++ b/storage/ndb/src/kernel/vm/SimulatedBlock.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2023, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -4838,6 +4838,8 @@ ErrorReporter::prepare_to_crash(bool first_phase, bool error_insert_crash) { (void)first_phase; (void)error_insert_crash; + + globalData.incrementWatchDogCounter(22); // Handling node stop } #endif diff --git a/storage/ndb/src/kernel/vm/WatchDog.cpp b/storage/ndb/src/kernel/vm/WatchDog.cpp index 58955bd2b8cf..9e92b46a9b39 100644 --- a/storage/ndb/src/kernel/vm/WatchDog.cpp +++ b/storage/ndb/src/kernel/vm/WatchDog.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2003, 2021, Oracle and/or its affiliates. + Copyright (c) 2003, 2025, Oracle and/or its affiliates. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2.0, @@ -217,6 +217,9 @@ const char *get_action(char *buf, Uint32 IPValue) case 21: action = "Initial value in mt_job_thread_main"; break; + case 22: + action = "Handling node stop"; + break; default: action = NULL; break; @@ -440,6 +443,13 @@ WatchDog::run() } if ((elapsed[i] > 3 * theInterval) || killer) { + if (oldCounterValue[i] == 4 || // Print Job Buffers at crash + oldCounterValue[i] == 22) { // Handling node stop + /* Immediate exit without attempting to trace + * to avoid I/O stalls leaving process hanging + */ + NdbShutdown(NDBD_EXIT_WATCHDOG_TERMINATE, NST_Watchdog); + } shutdownSystem(last_stuck_action); } } diff --git a/storage/ndb/src/kernel/vm/mt.cpp b/storage/ndb/src/kernel/vm/mt.cpp index f3eef3a619f8..c4a3f51d1c10 100644 --- a/storage/ndb/src/kernel/vm/mt.cpp +++ b/storage/ndb/src/kernel/vm/mt.cpp @@ -8049,6 +8049,14 @@ static bool crash_started = false; void ErrorReporter::prepare_to_crash(bool first_phase, bool error_insert_crash) { + { + void *value= NdbThread_GetTlsKey(NDB_THREAD_TLS_THREAD); + thr_data *selfptr = reinterpret_cast<thr_data *>(value); + if (selfptr != NULL) { + selfptr->m_watchdog_counter = 22; + } + } + if (first_phase) { NdbMutex_Lock(&g_thr_repository->stop_for_crash_mutex); diff --git a/storage/ndb/test/ndbapi/testNodeRestart.cpp b/storage/ndb/test/ndbapi/testNodeRestart.cpp index 5f0bfcdc6bce..bcb86669ea40 100644 --- a/storage/ndb/test/ndbapi/testNodeRestart.cpp +++ b/storage/ndb/test/ndbapi/testNodeRestart.cpp @@ -10332,56 +10332,63 @@ int runWatchdogSlowShutdown(NDBT_Context* ctx, NDBT_Step* step) * 3 Trigger shutdown * * Expectation - * - Shutdown triggered, but slow + * - Shutdown triggered, but very slow + * - Watchdog detects and also attempts shutdown + * - No crash results, shutdown completes eventually */ NdbRestarter restarter; - /* 1 Set low watchdog threshold */ - { - const int dumpVals[] = {DumpStateOrd::CmvmiSetWatchdogInterval, 2000 }; - CHECK((restarter.dumpStateAllNodes(dumpVals, 2) == NDBT_OK), - "Failed to set watchdog thresh"); - } - - /* 2 Use error insert to get error reporter to be slow - * during shutdown + /* Scenarios + * 1 : Stall during error reporting after releasing lock + * 2 : Stall during error reporting before releasing lock */ - { - const int dumpVals[] = {DumpStateOrd::CmvmiSetErrorHandlingError, 1 }; - CHECK((restarter.dumpStateAllNodes(dumpVals, 2) == NDBT_OK), - "Failed to set error handling mode"); - } - - /* 3 Trigger shutdown */ - const int nodeId = restarter.getNode(NdbRestarter::NS_RANDOM); - g_err << "Injecting crash in node " << nodeId << endl; - /* First request a 'NOSTART' restart on error insert */ - { - const int dumpVals[] = {DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1}; - CHECK((restarter.dumpStateOneNode(nodeId, dumpVals, 2) == NDBT_OK), - "Failed to request error insert restart"); - } + for (int scenario = 1; scenario < 3; scenario++) { + g_err << "Scenario " << scenario << endl; + /* 1 Set low watchdog threshold */ + { + const int dumpVals[] = {DumpStateOrd::CmvmiSetWatchdogInterval, 2000}; + CHECK((restarter.dumpStateAllNodes(dumpVals, 2) == NDBT_OK), + "Failed to set watchdog thresh"); + } - /* Next cause an error insert failure */ - CHECK((restarter.insertErrorInNode(nodeId, 9999) == NDBT_OK), - "Failed to request node crash"); + /* 2 Use error insert to get error reporter to be slow + * during shutdown + */ + { + int dumpVals[] = {DumpStateOrd::CmvmiSetErrorHandlingError,
0}; + dumpVals[1] = scenario; + CHECK((restarter.dumpStateAllNodes(dumpVals, 2) == NDBT_OK), + "Failed to set error handling mode"); + } - /* Expect shutdown to be stalled, and shortly after, watchdog - * to detect this and act - */ - g_err << "Waiting for node " << nodeId << " to stop." << endl; - CHECK((restarter.waitNodesNoStart(&nodeId, 1) == NDBT_OK), - "Timeout waiting for node to stop"); + /* 3 Trigger shutdown */ + const int nodeId = restarter.getNode(NdbRestarter::NS_RANDOM); + g_err << "Injecting crash in node " << nodeId << endl; + /* First request a 'NOSTART' restart on error insert */ + { + const int dumpVals[] = {DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1}; + CHECK((restarter.dumpStateOneNode(nodeId, dumpVals, 2) == NDBT_OK), + "Failed to request error insert restart"); + } + /* Next cause an error insert failure */ + CHECK((restarter.insertErrorInNode(nodeId, 9999) == NDBT_OK), + "Failed to request node crash"); - g_err << "Waiting for node " << nodeId << " to start." << endl; - CHECK((restarter.startNodes(&nodeId, 1) == NDBT_OK), - "Timeout waiting for node to start"); + /* Expect shutdown to be stalled, and shortly after, watchdog + * to detect this and act + */ + g_err << "Waiting for node " << nodeId << " to stop." << endl; + CHECK((restarter.waitNodesNoStart(&nodeId, 1) == NDBT_OK), + "Timeout waiting for node to stop"); + + g_err << "Waiting for node " << nodeId << " to start." << endl; + CHECK((restarter.startNodes(&nodeId, 1) == NDBT_OK), + "Timeout waiting for node to start"); - CHECK((restarter.waitClusterStarted() == NDBT_OK), - "Timeout waiting for cluster to start"); + CHECK((restarter.waitClusterStarted() == NDBT_OK), + "Timeout waiting for cluster to start"); + } g_err << "Success" << endl; return NDBT_OK; From 3d680eb2abed42179319cff9bd26530f08316f89 Mon Sep 17 00:00:00 2001 From: Karthik Kamath Date: Thu, 13 Mar 2025 21:14:44 +0530 Subject: [PATCH 12/13] Revert "BUG#31360522 : >=5.6.36 SOME RANGE QUERIES STILL CRASH..." This reverts commit e86c8eaa8012de53d67f4489f6c3774abdc4cdc1. Change-Id: I1e7d58346e92ef2b36ec9dad5d8f4e6128a72c4b --- sql/sql_partition.cc | 6 ------ sql/table.cc | 9 +-------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/sql/sql_partition.cc b/sql/sql_partition.cc index 359a4f6c7b3a..4eeb247a054d 100644 --- a/sql/sql_partition.cc +++ b/sql/sql_partition.cc @@ -834,12 +834,6 @@ static bool handle_list_of_fields(List_iterator it, for (i= 0; i < num_key_parts; i++) { Field *field= table->key_info[primary_key].key_part[i].field; - // BLOB/TEXT columns are not allowed in partitioning keys. - if (field->flags & BLOB_FLAG) - { - my_error(ER_BLOB_FIELD_IN_PART_FUNC_ERROR, MYF(0)); - DBUG_RETURN(TRUE); - } field->flags|= GET_FIXED_FIELDS_FLAG; } } diff --git a/sql/table.cc b/sql/table.cc index fb41f51cb07e..663dc57ffbdf 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -3241,15 +3241,8 @@ int open_table_from_share(THD *thd, TABLE_SHARE *share, const char *alias, { Field *field= key_part->field= outparam->field[key_part->fieldnr-1]; - /* - For spatial indexes, the key parts are assigned the length (4 * - sizeof(double)) in mysql_prepare_create_table() and the - field->key_length() is set to 0. This makes it appear like a prefixed - index. However, prefixed indexes are not allowed on Geometric columns. - Hence skipping new field creation for Geometric columns. 
- */ if (field->key_length() != key_part->length && - field->type() != MYSQL_TYPE_GEOMETRY) + !(field->flags & BLOB_FLAG)) { /* We are using only a prefix of the column as a key: From c4942c2c0183019551aae7fbc53af9b5caf96cac Mon Sep 17 00:00:00 2001 From: Bjorn Munch Date: Thu, 20 Mar 2025 09:37:05 +0100 Subject: [PATCH 13/13] Update License Book Approved-by: Balasubramanian Kandasamy --- LICENSE | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/LICENSE b/LICENSE index 26507c47ca88..4dd8fad63594 100644 --- a/LICENSE +++ b/LICENSE @@ -10,7 +10,7 @@ Introduction third-party software which may be included in this distribution of MySQL NDB Cluster 7.6.33 Community. - Last updated: December 2024 + Last updated: March 2025 Licensing Information @@ -36,7 +36,7 @@ Licensing Information reproduced below and can also be found along with its FAQ at http://oss.oracle.com/licenses/universal-foss-exception. - Copyright (c) 1997, 2024, Oracle and/or its affiliates. + Copyright (c) 1997, 2025, Oracle and/or its affiliates. Election of GPLv2 @@ -4327,30 +4327,29 @@ their respective owners. xxHash -Copyright (c) 2012-2014, Yann Collet +Copyright (c) 2012-2021 Yann Collet All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. +BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Some source files include the above license with different copyright years: +Copyright (C) 2012-2023 Yann Collet +Copyright (C) 2020-2024 Yann Collet ====================================================================== ======================================================================