24
24
* are treated as not a crash but approximately normal termination;
25
25
* the walsender will exit quickly without sending any more XLOG records.
26
26
*
27
- * If the server is shut down, postmaster sends us SIGUSR2 after all
28
- * regular backends have exited and the shutdown checkpoint has been written.
29
- * This instructs walsender to send any outstanding WAL, including the
30
- * shutdown checkpoint record, wait for it to be replicated to the standby,
31
- * and then exit.
27
+ * If the server is shut down, postmaster sends us SIGUSR2 after all regular
28
+ * backends have exited. This causes the walsender to switch to the "stopping"
29
+ * state. In this state, the walsender will reject any replication command
30
+ * that may generate WAL activity. The checkpointer begins the shutdown
31
+ * checkpoint once all walsenders are confirmed as stopping. When the shutdown
32
+ * checkpoint finishes, the postmaster sends us SIGINT. This instructs
33
+ * walsender to send any outstanding WAL, including the shutdown checkpoint
34
+ * record, wait for it to be replicated to the standby, and then exit.
32
35
*
33
36
*
34
37
* Portions Copyright (c) 2010-2017, PostgreSQL Global Development Group
@@ -177,13 +180,14 @@ static bool WalSndCaughtUp = false;
177
180
178
181
/* Flags set by signal handlers for later service in main loop */
179
182
static volatile sig_atomic_t got_SIGHUP = false;
180
- static volatile sig_atomic_t walsender_ready_to_stop = false;
183
+ static volatile sig_atomic_t got_SIGINT = false;
184
+ static volatile sig_atomic_t got_SIGUSR2 = false;
181
185
182
186
/*
183
- * This is set while we are streaming. When not set, SIGUSR2 signal will be
187
+ * This is set while we are streaming. When not set, SIGINT signal will be
184
188
* handled like SIGTERM. When set, the main loop is responsible for checking
185
- * walsender_ready_to_stop and terminating when it's set (after streaming any
186
- * remaining WAL).
189
+ * got_SIGINT and terminating when it's set (after streaming any remaining
190
+ * WAL).
187
191
*/
188
192
static volatile sig_atomic_t replication_active = false;
189
193
@@ -213,6 +217,7 @@ static struct
213
217
/* Signal handlers */
214
218
static void WalSndSigHupHandler (SIGNAL_ARGS );
215
219
static void WalSndXLogSendHandler (SIGNAL_ARGS );
220
+ static void WalSndSwitchStopping (SIGNAL_ARGS );
216
221
static void WalSndLastCycleHandler (SIGNAL_ARGS );
217
222
218
223
/* Prototypes for private functions */
@@ -299,11 +304,14 @@ WalSndErrorCleanup(void)
299
304
ReplicationSlotCleanup ();
300
305
301
306
replication_active = false;
302
- if (walsender_ready_to_stop )
307
+ if (got_SIGINT )
303
308
proc_exit (0 );
304
309
305
310
/* Revert back to startup state */
306
311
WalSndSetState (WALSNDSTATE_STARTUP );
312
+
313
+ if (got_SIGUSR2 )
314
+ WalSndSetState (WALSNDSTATE_STOPPING );
307
315
}
308
316
309
317
/*
@@ -676,7 +684,7 @@ StartReplication(StartReplicationCmd *cmd)
676
684
WalSndLoop (XLogSendPhysical );
677
685
678
686
replication_active = false;
679
- if (walsender_ready_to_stop )
687
+ if (got_SIGINT )
680
688
proc_exit (0 );
681
689
WalSndSetState (WALSNDSTATE_STARTUP );
682
690
@@ -1053,7 +1061,7 @@ StartLogicalReplication(StartReplicationCmd *cmd)
1053
1061
{
1054
1062
ereport (LOG ,
1055
1063
(errmsg ("terminating walsender process after promotion" )));
1056
- walsender_ready_to_stop = true;
1064
+ got_SIGINT = true;
1057
1065
}
1058
1066
1059
1067
WalSndSetState (WALSNDSTATE_CATCHUP );
@@ -1103,7 +1111,7 @@ StartLogicalReplication(StartReplicationCmd *cmd)
1103
1111
ReplicationSlotRelease ();
1104
1112
1105
1113
replication_active = false;
1106
- if (walsender_ready_to_stop )
1114
+ if (got_SIGINT )
1107
1115
proc_exit (0 );
1108
1116
WalSndSetState (WALSNDSTATE_STARTUP );
1109
1117
@@ -1290,6 +1298,14 @@ WalSndWaitForWal(XLogRecPtr loc)
1290
1298
else
1291
1299
RecentFlushPtr = GetXLogReplayRecPtr (NULL );
1292
1300
1301
+ /*
1302
+ * If postmaster asked us to switch to the stopping state, do so.
1303
+ * Shutdown is in progress and this will allow the checkpointer to
1304
+ * move on with the shutdown checkpoint.
1305
+ */
1306
+ if (got_SIGUSR2 )
1307
+ WalSndSetState (WALSNDSTATE_STOPPING );
1308
+
1293
1309
/*
1294
1310
* If postmaster asked us to stop, don't wait here anymore. This will
1295
1311
* cause the xlogreader to return without reading a full record, which
@@ -1299,7 +1315,7 @@ WalSndWaitForWal(XLogRecPtr loc)
1299
1315
* RecentFlushPtr, so we can send all remaining data before shutting
1300
1316
* down.
1301
1317
*/
1302
- if (walsender_ready_to_stop )
1318
+ if (got_SIGINT )
1303
1319
break ;
1304
1320
1305
1321
/*
@@ -1373,6 +1389,22 @@ exec_replication_command(const char *cmd_string)
1373
1389
MemoryContext cmd_context ;
1374
1390
MemoryContext old_context ;
1375
1391
1392
+ /*
1393
+ * If WAL sender has been told that shutdown is getting close, switch its
1394
+ * status accordingly to handle the next replication commands correctly.
1395
+ */
1396
+ if (got_SIGUSR2 )
1397
+ WalSndSetState (WALSNDSTATE_STOPPING );
1398
+
1399
+ /*
1400
+ * Throw error if in stopping mode. We need to prevent commands that could
1401
+ * generate WAL while the shutdown checkpoint is being written. To be
1402
+ * safe, we just prohibit all new commands.
1403
+ */
1404
+ if (MyWalSnd -> state == WALSNDSTATE_STOPPING )
1405
+ ereport (ERROR ,
1406
+ (errmsg ("cannot execute new commands while WAL sender is in stopping mode" )));
1407
+
1376
1408
/*
1377
1409
* CREATE_REPLICATION_SLOT ... LOGICAL exports a snapshot until the next
1378
1410
* command arrives. Clean up the old stuff if there's anything.
@@ -2095,13 +2127,20 @@ WalSndLoop(WalSndSendDataCallback send_data)
2095
2127
}
2096
2128
2097
2129
/*
2098
- * When SIGUSR2 arrives, we send any outstanding logs up to the
2130
+ * On receipt of SIGUSR2, switch the WAL sender to the stopping
2131
+ * state.
2132
+ */
2133
+ if (got_SIGUSR2 )
2134
+ WalSndSetState (WALSNDSTATE_STOPPING );
2135
+
2136
+ /*
2137
+ * When SIGINT arrives, we send any outstanding logs up to the
2099
2138
* shutdown checkpoint record (i.e., the latest record), wait for
2100
2139
* them to be replicated to the standby, and exit. This may be a
2101
2140
* normal termination at shutdown, or a promotion, the walsender
2102
2141
* is not sure which.
2103
2142
*/
2104
- if (walsender_ready_to_stop )
2143
+ if (got_SIGINT )
2105
2144
WalSndDone (send_data );
2106
2145
}
2107
2146
@@ -2841,7 +2880,23 @@ WalSndXLogSendHandler(SIGNAL_ARGS)
2841
2880
errno = save_errno ;
2842
2881
}
2843
2882
2844
- /* SIGUSR2: set flag to do a last cycle and shut down afterwards */
2883
+ /* SIGUSR2: set flag to switch to stopping state */
2884
+ static void
2885
+ WalSndSwitchStopping (SIGNAL_ARGS )
2886
+ {
2887
+ int save_errno = errno ;
2888
+
2889
+ got_SIGUSR2 = true;
2890
+ SetLatch (MyLatch );
2891
+
2892
+ errno = save_errno ;
2893
+ }
2894
+
2895
+ /*
2896
+ * SIGINT: set flag to do a last cycle and shut down afterwards. The WAL
2897
+ * sender should already have been switched to WALSNDSTATE_STOPPING at
2898
+ * this point.
2899
+ */
2845
2900
static void
2846
2901
WalSndLastCycleHandler (SIGNAL_ARGS )
2847
2902
{
@@ -2856,7 +2911,7 @@ WalSndLastCycleHandler(SIGNAL_ARGS)
2856
2911
if (!replication_active )
2857
2912
kill (MyProcPid , SIGTERM );
2858
2913
2859
- walsender_ready_to_stop = true;
2914
+ got_SIGINT = true;
2860
2915
SetLatch (MyLatch );
2861
2916
2862
2917
errno = save_errno ;
@@ -2869,14 +2924,14 @@ WalSndSignals(void)
2869
2924
/* Set up signal handlers */
2870
2925
pqsignal (SIGHUP , WalSndSigHupHandler ); /* set flag to read config
2871
2926
* file */
2872
- pqsignal (SIGINT , SIG_IGN ); /* not used */
2927
+ pqsignal (SIGINT , WalSndLastCycleHandler ); /* request a last cycle and
2928
+ * shutdown */
2873
2929
pqsignal (SIGTERM , die ); /* request shutdown */
2874
2930
pqsignal (SIGQUIT , quickdie ); /* hard crash time */
2875
2931
InitializeTimeouts (); /* establishes SIGALRM handler */
2876
2932
pqsignal (SIGPIPE , SIG_IGN );
2877
2933
pqsignal (SIGUSR1 , WalSndXLogSendHandler ); /* request WAL sending */
2878
- pqsignal (SIGUSR2 , WalSndLastCycleHandler ); /* request a last cycle and
2879
- * shutdown */
2934
+ pqsignal (SIGUSR2 , WalSndSwitchStopping ); /* switch to stopping state */
2880
2935
2881
2936
/* Reset some signals that are accepted by postmaster but not here */
2882
2937
pqsignal (SIGCHLD , SIG_DFL );
@@ -2954,6 +3009,50 @@ WalSndWakeup(void)
2954
3009
}
2955
3010
}
2956
3011
3012
+ /*
3013
+ * Wait until all the WAL senders have reached the stopping state. This is
3014
+ * used by the checkpointer to control when shutdown checkpoints can
3015
+ * safely begin.
3016
+ */
3017
+ void
3018
+ WalSndWaitStopping (void )
3019
+ {
3020
+ for (;;)
3021
+ {
3022
+ int i ;
3023
+ bool all_stopped = true;
3024
+
3025
+ for (i = 0 ; i < max_wal_senders ; i ++ )
3026
+ {
3027
+ WalSndState state ;
3028
+ WalSnd * walsnd = & WalSndCtl -> walsnds [i ];
3029
+
3030
+ SpinLockAcquire (& walsnd -> mutex );
3031
+
3032
+ if (walsnd -> pid == 0 )
3033
+ {
3034
+ SpinLockRelease (& walsnd -> mutex );
3035
+ continue ;
3036
+ }
3037
+
3038
+ state = walsnd -> state ;
3039
+ SpinLockRelease (& walsnd -> mutex );
3040
+
3041
+ if (state != WALSNDSTATE_STOPPING )
3042
+ {
3043
+ all_stopped = false;
3044
+ break ;
3045
+ }
3046
+ }
3047
+
3048
+ /* safe to leave if confirmation is done for all WAL senders */
3049
+ if (all_stopped )
3050
+ return ;
3051
+
3052
+ pg_usleep (10000L ); /* wait for 10 msec */
3053
+ }
3054
+ }
3055
+
2957
3056
/* Set state for current walsender (only called in walsender) */
2958
3057
void
2959
3058
WalSndSetState (WalSndState state )
@@ -2987,6 +3086,8 @@ WalSndGetStateString(WalSndState state)
2987
3086
return "catchup" ;
2988
3087
case WALSNDSTATE_STREAMING :
2989
3088
return "streaming" ;
3089
+ case WALSNDSTATE_STOPPING :
3090
+ return "stopping" ;
2990
3091
}
2991
3092
return "UNKNOWN" ;
2992
3093
}
0 commit comments