Skip to content

Commit 1dbf3e8

Browse files
committed
BUG#32836868: BACKPORT BUG#32738137 TO 5.7
There are two ways of interaction between Group Replication plugin (GR) and the Group Communication layer (GCS): GR -> GCS: commands like START, STOP and others which belong to GR plugin do API calls to GCS. Those calls are wrapped on the `Gcs_operations` singleton. GCS -> GR: events from GCS, like message delivery, membership changes and others which belong to GCS do call event handlers on GR. Those event handlers are wrapped on the `Plugin_gcs_events_handler` singleton. Due to the nature of GCS event driven, these two singletons cannot have mutual exclusive sections, otherwise we can have a deadlock. That rule was not being respected during the `Plugin_gcs_events_handler::on_view_changed()` event handler, which is handling a event from GCS, but during its body is calling `Gcs_operations::leave_coordination_member_left()` which is mutually exclusive with the method `Gcs_operations::leave()`. We could have a deadlock between: 1) STOP GROUP_REPLICATION, which calls `Gcs_operations::leave()` 2) `Plugin_gcs_events_handler::on_view_changed()`, which calls `Gcs_operations::leave_coordination_member_left()` 1) is holding a write lock on `Gcs_operations`, 2) is waiting for the same write lock. Since `Gcs_operations::leave_coordination_member_left()` is only changing flags that indicate at which point the leave operation is, to solve the deadlock we convert that flags into atomic variables, thence do not requiring the write lock acquire. Additionally, another case was detected. 1) START GROUP_REPLICATION, which calls `Gcs_operations::join()` 2) `Plugin_gcs_events_handler::on_view_changed()`, which calls `Gcs_operations::get_current_view()` 1) is holding a write lock on `Gcs_operations`, 2) is waiting for the read lock on the same lock. `Plugin_gcs_events_handler::on_view_changed()` already knows the current view id, as such the call to `Gcs_operations::get_current_view()` was removed. RB: 26325
1 parent 9fa411d commit 1dbf3e8

File tree

6 files changed

+15
-65
lines changed

6 files changed

+15
-65
lines changed

rapid/plugin/group_replication/include/gcs_operations.h

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -189,14 +189,11 @@ class Gcs_operations
189189
Gcs_interface *gcs_interface;
190190

191191
/** Is the member leaving*/
192-
bool leave_coordination_leaving;
192+
int32 leave_coordination_leaving;
193193
/** Did the member already left*/
194-
bool leave_coordination_left;
195-
/** Is finalize ongoing*/
196-
bool finalize_ongoing;
194+
int32 leave_coordination_left;
197195

198196
Checkable_rwlock *gcs_operations_lock;
199-
Checkable_rwlock *finalize_ongoing_lock;
200197
};
201198

202199
#endif /* GCS_OPERATIONS_INCLUDE */

rapid/plugin/group_replication/include/plugin_psi.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,6 @@ extern PSI_rwlock_key
136136
key_GR_RWLOCK_cert_stable_gtid_set,
137137
key_GR_RWLOCK_io_cache_unused_list,
138138
key_GR_RWLOCK_plugin_stop,
139-
key_GR_RWLOCK_gcs_operations,
140-
key_GR_RWLOCK_gcs_operations_finalize_ongoing;
139+
key_GR_RWLOCK_gcs_operations;
141140

142141
#endif /* PLUGIN_PSI_INCLUDED */

rapid/plugin/group_replication/src/gcs_event_handlers.cc

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -553,18 +553,10 @@ Plugin_gcs_events_handler::on_view_changed(const Gcs_view& new_view,
553553

554554
if (!is_leaving)
555555
{
556-
std::string view_id_representation= "";
557-
Gcs_view *view= gcs_module->get_current_view();
558-
if (view != NULL)
559-
{
560-
view_id_representation= view->get_view_id().get_representation();
561-
delete view;
562-
}
563-
564556
log_message(MY_INFORMATION_LEVEL,
565557
"Group membership changed to %s on view %s.",
566558
group_member_mgr->get_string_current_view_active_hosts().c_str(),
567-
view_id_representation.c_str());
559+
new_view.get_view_id().get_representation().c_str());
568560
}
569561
else
570562
{

rapid/plugin/group_replication/src/gcs_operations.cc

Lines changed: 9 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -32,18 +32,12 @@ const std::string Gcs_operations::gcs_engine= "xcom";
3232

3333
Gcs_operations::Gcs_operations()
3434
: gcs_interface(NULL),
35-
leave_coordination_leaving(false),
36-
leave_coordination_left(false),
37-
finalize_ongoing(false)
35+
leave_coordination_leaving(0),
36+
leave_coordination_left(0)
3837
{
3938
gcs_operations_lock= new Checkable_rwlock(
4039
#ifdef HAVE_PSI_INTERFACE
4140
key_GR_RWLOCK_gcs_operations
42-
#endif
43-
);
44-
finalize_ongoing_lock= new Checkable_rwlock(
45-
#ifdef HAVE_PSI_INTERFACE
46-
key_GR_RWLOCK_gcs_operations_finalize_ongoing
4741
#endif
4842
);
4943
}
@@ -52,7 +46,6 @@ Gcs_operations::Gcs_operations()
5246
Gcs_operations::~Gcs_operations()
5347
{
5448
delete gcs_operations_lock;
55-
delete finalize_ongoing_lock;
5649
}
5750

5851

@@ -63,8 +56,8 @@ Gcs_operations::initialize()
6356
int error= 0;
6457
gcs_operations_lock->wrlock();
6558

66-
leave_coordination_leaving= false;
67-
leave_coordination_left= false;
59+
my_atomic_store32(&leave_coordination_leaving, 0);
60+
my_atomic_store32(&leave_coordination_left, 0);
6861

6962
assert(gcs_interface == NULL);
7063
if ((gcs_interface=
@@ -100,20 +93,14 @@ void
10093
Gcs_operations::finalize()
10194
{
10295
DBUG_ENTER("Gcs_operations::finalize");
103-
finalize_ongoing_lock->wrlock();
104-
finalize_ongoing= true;
10596
gcs_operations_lock->wrlock();
106-
finalize_ongoing_lock->unlock();
10797

10898
if (gcs_interface != NULL)
10999
gcs_interface->finalize();
110100
Gcs_interface_factory::cleanup(gcs_engine);
111101
gcs_interface= NULL;
112102

113-
finalize_ongoing_lock->wrlock();
114-
finalize_ongoing= false;
115103
gcs_operations_lock->unlock();
116-
finalize_ongoing_lock->unlock();
117104
DBUG_VOID_RETURN;
118105
}
119106

@@ -213,12 +200,12 @@ Gcs_operations::leave()
213200
enum_leave_state state= ERROR_WHEN_LEAVING;
214201
gcs_operations_lock->wrlock();
215202

216-
if (leave_coordination_left)
203+
if (my_atomic_load32(&leave_coordination_left))
217204
{
218205
state= ALREADY_LEFT;
219206
goto end;
220207
}
221-
if (leave_coordination_leaving)
208+
if (my_atomic_load32(&leave_coordination_leaving))
222209
{
223210
state= ALREADY_LEAVING;
224211
goto end;
@@ -236,7 +223,7 @@ Gcs_operations::leave()
236223
if (!gcs_control->leave())
237224
{
238225
state= NOW_LEAVING;
239-
leave_coordination_leaving= true;
226+
my_atomic_store32(&leave_coordination_leaving, 1);
240227
goto end;
241228
}
242229
}
@@ -268,30 +255,8 @@ void
268255
Gcs_operations::leave_coordination_member_left()
269256
{
270257
DBUG_ENTER("Gcs_operations::leave_coordination_member_left");
271-
272-
/*
273-
If finalize method is ongoing, it means that GCS is waiting that
274-
all messages and views are delivered to GR, if we proceed with
275-
this method we will enter on the deadlock:
276-
1) leave view was not delivered before wait view timeout;
277-
2) finalize did start and acquired lock->wrlock();
278-
3) leave view was delivered, member_left is waiting to
279-
acquire lock->wrlock().
280-
So, if leaving, we just do nothing.
281-
*/
282-
finalize_ongoing_lock->rdlock();
283-
if (finalize_ongoing)
284-
{
285-
finalize_ongoing_lock->unlock();
286-
DBUG_VOID_RETURN;
287-
}
288-
gcs_operations_lock->wrlock();
289-
finalize_ongoing_lock->unlock();
290-
291-
leave_coordination_leaving= false;
292-
leave_coordination_left= true;
293-
294-
gcs_operations_lock->unlock();
258+
my_atomic_store32(&leave_coordination_leaving, 0);
259+
my_atomic_store32(&leave_coordination_left, 1);
295260
DBUG_VOID_RETURN;
296261
}
297262

rapid/plugin/group_replication/src/plugin_psi.cc

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,7 @@ PSI_thread_key key_GR_THD_applier_module_receiver,
8282
PSI_rwlock_key key_GR_RWLOCK_cert_stable_gtid_set,
8383
key_GR_RWLOCK_io_cache_unused_list,
8484
key_GR_RWLOCK_plugin_stop,
85-
key_GR_RWLOCK_gcs_operations,
86-
key_GR_RWLOCK_gcs_operations_finalize_ongoing;
85+
key_GR_RWLOCK_gcs_operations;
8786

8887
#ifdef HAVE_PSI_INTERFACE
8988
static PSI_mutex_info all_group_replication_psi_mutex_keys[]=
@@ -156,8 +155,7 @@ static PSI_rwlock_info all_group_replication_psi_rwlock_keys[]=
156155
{&key_GR_RWLOCK_cert_stable_gtid_set, "RWLOCK_certifier_stable_gtid_set", PSI_FLAG_GLOBAL},
157156
{&key_GR_RWLOCK_io_cache_unused_list , "RWLOCK_io_cache_unused_list", PSI_FLAG_GLOBAL},
158157
{&key_GR_RWLOCK_plugin_stop, "RWLOCK_plugin_stop", PSI_FLAG_GLOBAL},
159-
{&key_GR_RWLOCK_gcs_operations, "RWLOCK_gcs_operations", PSI_FLAG_GLOBAL},
160-
{&key_GR_RWLOCK_gcs_operations_finalize_ongoing, "RWLOCK_gcs_operations_finalize_ongoing", PSI_FLAG_GLOBAL}
158+
{&key_GR_RWLOCK_gcs_operations, "RWLOCK_gcs_operations", PSI_FLAG_GLOBAL}
161159
};
162160

163161
void register_group_replication_mutex_psi_keys(PSI_mutex_info mutexes[],

rapid/plugin/group_replication/tests/mtr/r/gr_psi_keys.result

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ wait/synch/rwlock/group_rpl/RWLOCK_certifier_stable_gtid_set YES YES
4242
wait/synch/rwlock/group_rpl/RWLOCK_io_cache_unused_list YES YES
4343
wait/synch/rwlock/group_rpl/RWLOCK_plugin_stop YES YES
4444
wait/synch/rwlock/group_rpl/RWLOCK_gcs_operations YES YES
45-
wait/synch/rwlock/group_rpl/RWLOCK_gcs_operations_finalize_ongoing YES YES
4645
wait/synch/cond/group_rpl/COND_applier_module_run YES YES
4746
wait/synch/cond/group_rpl/COND_applier_module_suspend YES YES
4847
wait/synch/cond/group_rpl/COND_applier_module_wait YES YES

0 commit comments

Comments
 (0)