Skip to content

Commit 55b454d

Browse files
anarazel and macdice committed
aio: Infrastructure for io_method=worker
This commit contains the basic, system-wide, infrastructure for io_method=worker. It does not yet actually execute IO; this commit just provides the infrastructure for running IO workers, kept separate for easier review.

The number of IO workers can be adjusted with a PGC_SIGHUP GUC. Eventually we'd like to make the number of workers dynamically scale up/down based on the current "IO load".

To allow the number of IO workers to be increased without a restart, we need to reserve PGPROC entries for the workers unconditionally. This has been judged to be worth the cost. If it turns out to be problematic, we can introduce a PGC_POSTMASTER GUC to control the maximum number.

As IO workers might be needed during shutdown, e.g. for AIO during the shutdown checkpoint, a new PMState phase is added. IO workers are shut down after the shutdown checkpoint has been performed and walsender/archiver have shut down, but before the checkpointer itself shuts down. See also 87a6690.

Updates PGSTAT_FILE_FORMAT_ID due to the addition of a new BackendType.

Reviewed-by: Noah Misch <noah@leadboat.com>
Co-authored-by: Thomas Munro <thomas.munro@gmail.com>
Co-authored-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/uvrtrknj4kdytuboidbhwclo4gxhswwcpgadptsjvjqcluzmah%40brqs62irg4dt
Discussion: https://postgr.es/m/20210223100344.llw5an2aklengrmn@alap3.anarazel.de
Discussion: https://postgr.es/m/stj36ea6yyhoxtqkhpieia2z4krnam7qyetc57rfezgk4zgapf@gcnactj4z56m
1 parent 549ea06 commit 55b454d

File tree

20 files changed

+342
-15
lines changed

20 files changed

+342
-15
lines changed

doc/src/sgml/config.sgml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2689,6 +2689,25 @@ include_dir 'conf.d'
26892689
</listitem>
26902690
</varlistentry>
26912691

2692+
<varlistentry id="guc-io-workers" xreflabel="io_workers">
2693+
<term><varname>io_workers</varname> (<type>int</type>)
2694+
<indexterm>
2695+
<primary><varname>io_workers</varname> configuration parameter</primary>
2696+
</indexterm>
2697+
</term>
2698+
<listitem>
2699+
<para>
2700+
Selects the number of I/O worker processes to use. The default is
2701+
3. This parameter can only be set in the
2702+
<filename>postgresql.conf</filename> file or on the server command
2703+
line.
2704+
</para>
2705+
<para>
2706+
Only has an effect if <xref linkend="guc-io-method"/> is set to
2707+
<literal>worker</literal>.
2708+
</para>
2709+
</listitem>
2710+
</varlistentry>
26922711
</variablelist>
26932712
</sect2>
26942713

src/backend/postmaster/launch_backend.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
#include "replication/slotsync.h"
4949
#include "replication/walreceiver.h"
5050
#include "storage/dsm.h"
51+
#include "storage/io_worker.h"
5152
#include "storage/pg_shmem.h"
5253
#include "tcop/backend_startup.h"
5354
#include "utils/memutils.h"
@@ -197,6 +198,7 @@ static child_process_kind child_process_kinds[] = {
197198
[B_ARCHIVER] = {"archiver", PgArchiverMain, true},
198199
[B_BG_WRITER] = {"bgwriter", BackgroundWriterMain, true},
199200
[B_CHECKPOINTER] = {"checkpointer", CheckpointerMain, true},
201+
[B_IO_WORKER] = {"io_worker", IoWorkerMain, true},
200202
[B_STARTUP] = {"startup", StartupProcessMain, true},
201203
[B_WAL_RECEIVER] = {"wal_receiver", WalReceiverMain, true},
202204
[B_WAL_SUMMARIZER] = {"wal_summarizer", WalSummarizerMain, true},

src/backend/postmaster/pmchild.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ InitPostmasterChildSlots(void)
101101

102102
pmchild_pools[B_AUTOVAC_WORKER].size = autovacuum_worker_slots;
103103
pmchild_pools[B_BG_WORKER].size = max_worker_processes;
104+
pmchild_pools[B_IO_WORKER].size = MAX_IO_WORKERS;
104105

105106
/*
106107
* There can be only one of each of these running at a time. They each

src/backend/postmaster/postmaster.c

Lines changed: 162 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,12 @@
108108
#include "replication/logicallauncher.h"
109109
#include "replication/slotsync.h"
110110
#include "replication/walsender.h"
111+
#include "storage/aio_subsys.h"
111112
#include "storage/fd.h"
113+
#include "storage/io_worker.h"
112114
#include "storage/ipc.h"
113115
#include "storage/pmsignal.h"
116+
#include "storage/proc.h"
114117
#include "tcop/backend_startup.h"
115118
#include "tcop/tcopprot.h"
116119
#include "utils/datetime.h"
@@ -340,6 +343,7 @@ typedef enum
340343
* ckpt */
341344
PM_WAIT_XLOG_ARCHIVAL, /* waiting for archiver and walsenders to
342345
* finish */
346+
PM_WAIT_IO_WORKERS, /* waiting for io workers to exit */
343347
PM_WAIT_CHECKPOINTER, /* waiting for checkpointer to shut down */
344348
PM_WAIT_DEAD_END, /* waiting for dead-end children to exit */
345349
PM_NO_CHILDREN, /* all important children have exited */
@@ -402,6 +406,10 @@ bool LoadedSSL = false;
402406
static DNSServiceRef bonjour_sdref = NULL;
403407
#endif
404408

409+
/* State for IO worker management. */
410+
static int io_worker_count = 0;
411+
static PMChild *io_worker_children[MAX_IO_WORKERS];
412+
405413
/*
406414
* postmaster.c - function prototypes
407415
*/
@@ -436,6 +444,8 @@ static void TerminateChildren(int signal);
436444
static int CountChildren(BackendTypeMask targetMask);
437445
static void LaunchMissingBackgroundProcesses(void);
438446
static void maybe_start_bgworkers(void);
447+
static bool maybe_reap_io_worker(int pid);
448+
static void maybe_adjust_io_workers(void);
439449
static bool CreateOptsFile(int argc, char *argv[], char *fullprogname);
440450
static PMChild *StartChildProcess(BackendType type);
441451
static void StartSysLogger(void);
@@ -1365,6 +1375,11 @@ PostmasterMain(int argc, char *argv[])
13651375
*/
13661376
AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STARTING);
13671377

1378+
UpdatePMState(PM_STARTUP);
1379+
1380+
/* Make sure we can perform I/O while starting up. */
1381+
maybe_adjust_io_workers();
1382+
13681383
/* Start bgwriter and checkpointer so they can help with recovery */
13691384
if (CheckpointerPMChild == NULL)
13701385
CheckpointerPMChild = StartChildProcess(B_CHECKPOINTER);
@@ -1377,7 +1392,6 @@ PostmasterMain(int argc, char *argv[])
13771392
StartupPMChild = StartChildProcess(B_STARTUP);
13781393
Assert(StartupPMChild != NULL);
13791394
StartupStatus = STARTUP_RUNNING;
1380-
UpdatePMState(PM_STARTUP);
13811395

13821396
/* Some workers may be scheduled to start now */
13831397
maybe_start_bgworkers();
@@ -2502,6 +2516,16 @@ process_pm_child_exit(void)
25022516
continue;
25032517
}
25042518

2519+
/* Was it an IO worker? */
2520+
if (maybe_reap_io_worker(pid))
2521+
{
2522+
if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus))
2523+
HandleChildCrash(pid, exitstatus, _("io worker"));
2524+
2525+
maybe_adjust_io_workers();
2526+
continue;
2527+
}
2528+
25052529
/*
25062530
* Was it a backend or a background worker?
25072531
*/
@@ -2723,6 +2747,7 @@ HandleFatalError(QuitSignalReason reason, bool consider_sigabrt)
27232747
case PM_WAIT_XLOG_SHUTDOWN:
27242748
case PM_WAIT_XLOG_ARCHIVAL:
27252749
case PM_WAIT_CHECKPOINTER:
2750+
case PM_WAIT_IO_WORKERS:
27262751

27272752
/*
27282753
* NB: Similar code exists in PostmasterStateMachine()'s handling
@@ -2905,20 +2930,21 @@ PostmasterStateMachine(void)
29052930

29062931
/*
29072932
* If we are doing crash recovery or an immediate shutdown then we
2908-
* expect archiver, checkpointer and walsender to exit as well,
2909-
* otherwise not.
2933+
* expect archiver, checkpointer, io workers and walsender to exit as
2934+
* well, otherwise not.
29102935
*/
29112936
if (FatalError || Shutdown >= ImmediateShutdown)
29122937
targetMask = btmask_add(targetMask,
29132938
B_CHECKPOINTER,
29142939
B_ARCHIVER,
2940+
B_IO_WORKER,
29152941
B_WAL_SENDER);
29162942

29172943
/*
2918-
* Normally walsenders and archiver will continue running; they will
2919-
* be terminated later after writing the checkpoint record. We also
2920-
* let dead-end children to keep running for now. The syslogger
2921-
* process exits last.
2944+
* Normally archiver, checkpointer, IO workers and walsenders will
2945+
* continue running; they will be terminated later after writing the
2946+
* checkpoint record. We also let dead-end children keep running
2947+
* for now. The syslogger process exits last.
29222948
*
29232949
* This assertion checks that we have covered all backend types,
29242950
* either by including them in targetMask, or by noting here that they
@@ -2933,12 +2959,13 @@ PostmasterStateMachine(void)
29332959
B_LOGGER);
29342960

29352961
/*
2936-
* Archiver, checkpointer and walsender may or may not be in
2937-
* targetMask already.
2962+
* Archiver, checkpointer, IO workers, and walsender may or may
2963+
* not be in targetMask already.
29382964
*/
29392965
remainMask = btmask_add(remainMask,
29402966
B_ARCHIVER,
29412967
B_CHECKPOINTER,
2968+
B_IO_WORKER,
29422969
B_WAL_SENDER);
29432970

29442971
/* these are not real postmaster children */
@@ -3039,11 +3066,25 @@ PostmasterStateMachine(void)
30393066
{
30403067
/*
30413068
* PM_WAIT_XLOG_ARCHIVAL state ends when there are no children other
3042-
* than checkpointer, dead-end children and logger left. There
3069+
* than checkpointer, io workers, logger and dead-end children left. There
30433070
* shouldn't be any regular backends left by now anyway; what we're
30443071
* really waiting for is for walsenders and archiver to exit.
30453072
*/
3046-
if (CountChildren(btmask_all_except(B_CHECKPOINTER, B_LOGGER, B_DEAD_END_BACKEND)) == 0)
3073+
if (CountChildren(btmask_all_except(B_CHECKPOINTER, B_IO_WORKER,
3074+
B_LOGGER, B_DEAD_END_BACKEND)) == 0)
3075+
{
3076+
UpdatePMState(PM_WAIT_IO_WORKERS);
3077+
SignalChildren(SIGUSR2, btmask(B_IO_WORKER));
3078+
}
3079+
}
3080+
3081+
if (pmState == PM_WAIT_IO_WORKERS)
3082+
{
3083+
/*
3084+
* PM_WAIT_IO_WORKERS state ends when there's only checkpointer and
3085+
* dead-end children left.
3086+
*/
3087+
if (io_worker_count == 0)
30473088
{
30483089
UpdatePMState(PM_WAIT_CHECKPOINTER);
30493090

@@ -3171,10 +3212,14 @@ PostmasterStateMachine(void)
31713212
/* re-create shared memory and semaphores */
31723213
CreateSharedMemoryAndSemaphores();
31733214

3215+
UpdatePMState(PM_STARTUP);
3216+
3217+
/* Make sure we can perform I/O while starting up. */
3218+
maybe_adjust_io_workers();
3219+
31743220
StartupPMChild = StartChildProcess(B_STARTUP);
31753221
Assert(StartupPMChild != NULL);
31763222
StartupStatus = STARTUP_RUNNING;
3177-
UpdatePMState(PM_STARTUP);
31783223
/* crash recovery started, reset SIGKILL flag */
31793224
AbortStartTime = 0;
31803225

@@ -3198,6 +3243,7 @@ pmstate_name(PMState state)
31983243
PM_TOSTR_CASE(PM_WAIT_BACKENDS);
31993244
PM_TOSTR_CASE(PM_WAIT_XLOG_SHUTDOWN);
32003245
PM_TOSTR_CASE(PM_WAIT_XLOG_ARCHIVAL);
3246+
PM_TOSTR_CASE(PM_WAIT_IO_WORKERS);
32013247
PM_TOSTR_CASE(PM_WAIT_DEAD_END);
32023248
PM_TOSTR_CASE(PM_WAIT_CHECKPOINTER);
32033249
PM_TOSTR_CASE(PM_NO_CHILDREN);
@@ -3235,6 +3281,16 @@ LaunchMissingBackgroundProcesses(void)
32353281
if (SysLoggerPMChild == NULL && Logging_collector)
32363282
StartSysLogger();
32373283

3284+
/*
3285+
* The number of configured workers might have changed, or a prior start
3286+
* of a worker might have failed. Check if we need to start/stop any
3287+
* workers.
3288+
*
3289+
* A config file change will always lead to this function being called, so
3290+
* we will always process the config change in a timely manner.
3291+
*/
3292+
maybe_adjust_io_workers();
3293+
32383294
/*
32393295
* The checkpointer and the background writer are active from the start,
32403296
* until shutdown is initiated.
@@ -4120,6 +4176,7 @@ bgworker_should_start_now(BgWorkerStartTime start_time)
41204176
case PM_WAIT_DEAD_END:
41214177
case PM_WAIT_XLOG_ARCHIVAL:
41224178
case PM_WAIT_XLOG_SHUTDOWN:
4179+
case PM_WAIT_IO_WORKERS:
41234180
case PM_WAIT_BACKENDS:
41244181
case PM_STOP_BACKENDS:
41254182
break;
@@ -4270,6 +4327,99 @@ maybe_start_bgworkers(void)
42704327
}
42714328
}
42724329

4330+
/*
 * Check whether 'pid' belongs to one of the IO workers tracked in
 * io_worker_children[]. If so, release its postmaster child slot, clear the
 * array entry, decrement io_worker_count and return true. Returns false if
 * no tracked IO worker has that pid.
 */
static bool
4331+
maybe_reap_io_worker(int pid)
4332+
{
4333+
/* Entries may be NULL (unused), so the whole array has to be scanned. */
for (int id = 0; id < MAX_IO_WORKERS; ++id)
4334+
{
4335+
if (io_worker_children[id] &&
4336+
io_worker_children[id]->pid == pid)
4337+
{
4338+
ReleasePostmasterChildSlot(io_worker_children[id]);
4339+
4340+
--io_worker_count;
4341+
io_worker_children[id] = NULL;
4342+
return true;
4343+
}
4344+
}
4345+
return false;
4346+
}
4347+
4348+
/*
4349+
* Start or stop IO workers, to close the gap between the number of running
4350+
* workers and the number of configured workers. Used to respond to change of
4351+
* the io_workers GUC (by increasing and decreasing the number of workers), as
4352+
* well as workers terminating in response to errors (by starting
4353+
* "replacement" workers).
*
* Workers are started until io_worker_count reaches io_workers or a launch
* fails. When there are too many, at most one worker is signalled per call,
* and io_worker_count is not decremented here; it only drops once the exit
* is reaped by maybe_reap_io_worker().
4354+
*/
4355+
static void
4356+
maybe_adjust_io_workers(void)
4357+
{
4358+
/*
* Nothing to do unless IO workers are enabled (presumably only with
* io_method=worker; confirm against pgaio_workers_enabled()).
*/
if (!pgaio_workers_enabled())
4359+
return;
4360+
4361+
/*
4362+
* If we're in the final shutdown state, then we're just waiting for all
4363+
* processes to exit.
4364+
*/
4365+
if (pmState >= PM_WAIT_IO_WORKERS)
4366+
return;
4367+
4368+
/* Don't start new workers during an immediate shutdown either. */
4369+
if (Shutdown >= ImmediateShutdown)
4370+
return;
4371+
4372+
/*
4373+
* Don't start new workers if we're in the shutdown phase of a crash
4374+
* restart. But we *do* need to start if we're already starting up again.
4375+
*/
4376+
if (FatalError && pmState >= PM_STOP_BACKENDS)
4377+
return;
4378+
4379+
/* Already guaranteed by the pmState early return above. */
Assert(pmState < PM_WAIT_IO_WORKERS);
4380+
4381+
/* Not enough running? */
4382+
while (io_worker_count < io_workers)
4383+
{
4384+
PMChild *child;
4385+
int id;
4386+
4387+
/* find unused entry in io_worker_children array */
4388+
for (id = 0; id < MAX_IO_WORKERS; ++id)
4389+
{
4390+
if (io_worker_children[id] == NULL)
4391+
break;
4392+
}
4393+
if (id == MAX_IO_WORKERS)
4394+
elog(ERROR, "could not find a free IO worker ID");
4395+
4396+
/* Try to launch one. */
4397+
child = StartChildProcess(B_IO_WORKER);
4398+
if (child != NULL)
4399+
{
4400+
io_worker_children[id] = child;
4401+
++io_worker_count;
4402+
}
4403+
else
4404+
break; /* XXX try again soon? */
4405+
}
4406+
4407+
/*
* Too many running? Only one worker is signalled per call; the count is
* left unchanged until that worker's exit is reaped.
*/
4408+
if (io_worker_count > io_workers)
4409+
{
4410+
/* ask the IO worker in the highest slot to exit */
4411+
for (int id = MAX_IO_WORKERS - 1; id >= 0; --id)
4412+
{
4413+
if (io_worker_children[id] != NULL)
4414+
{
4415+
kill(io_worker_children[id]->pid, SIGUSR2);
4416+
break;
4417+
}
4418+
}
4419+
}
4420+
}
4421+
4422+
42734423
/*
42744424
* When a backend asks to be notified about worker state changes, we
42754425
* set a flag in its backend entry. The background worker machinery needs

src/backend/storage/aio/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ OBJS = \
1515
aio_io.o \
1616
aio_target.o \
1717
method_sync.o \
18+
method_worker.o \
1819
read_stream.o
1920

2021
include $(top_srcdir)/src/backend/common.mk

src/backend/storage/aio/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,6 @@ backend_sources += files(
77
'aio_io.c',
88
'aio_target.c',
99
'method_sync.c',
10+
'method_worker.c',
1011
'read_stream.c',
1112
)

0 commit comments

Comments
 (0)