diff options
author | Andres Freund | 2025-03-18 14:52:33 +0000 |
---|---|---|
committer | Andres Freund | 2025-03-18 15:54:01 +0000 |
commit | 55b454d0e14084c841a034073abbf1a0ea937a45 (patch) | |
tree | 4bdd85a6acb02123b35bfeb1c33c1c071be30ba1 /src/backend/postmaster/postmaster.c | |
parent | 549ea06e4217aca10d3a73dc09cf5018c51bc23a (diff) |
aio: Infrastructure for io_method=worker
This commit contains the basic, system-wide, infrastructure for
io_method=worker. It does not yet actually execute IO, this commit just
provides the infrastructure for running IO workers, kept separate for easier
review.
The number of IO workers can be adjusted with a PGC_SIGHUP GUC. Eventually
we'd like to make the number of workers dynamically scale up/down based on the
current "IO load".
To allow the number of IO workers to be increased without a restart, we need
to reserve PGPROC entries for the workers unconditionally. This has been
judged to be worth the cost. If it turns out to be problematic, we can
introduce a PGC_POSTMASTER GUC to control the maximum number.
As io workers might be needed during shutdown, e.g. for AIO during the
shutdown checkpoint, a new PMState phase is added. IO workers are shut down
after the shutdown checkpoint has been performed and walsender/archiver have
shut down, but before the checkpointer itself shuts down. See also
87a6690cc69.
Updates PGSTAT_FILE_FORMAT_ID due to the addition of a new BackendType.
Reviewed-by: Noah Misch <[email protected]>
Co-authored-by: Thomas Munro <[email protected]>
Co-authored-by: Andres Freund <[email protected]>
Discussion: https://postgr.es/m/uvrtrknj4kdytuboidbhwclo4gxhswwcpgadptsjvjqcluzmah%40brqs62irg4dt
Discussion: https://postgr.es/m/[email protected]
Discussion: https://postgr.es/m/stj36ea6yyhoxtqkhpieia2z4krnam7qyetc57rfezgk4zgapf@gcnactj4z56m
Diffstat (limited to 'src/backend/postmaster/postmaster.c')
-rw-r--r-- | src/backend/postmaster/postmaster.c | 174 |
1 files changed, 162 insertions, 12 deletions
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index d13846298bd..a0c37532d2f 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -108,9 +108,12 @@ #include "replication/logicallauncher.h" #include "replication/slotsync.h" #include "replication/walsender.h" +#include "storage/aio_subsys.h" #include "storage/fd.h" +#include "storage/io_worker.h" #include "storage/ipc.h" #include "storage/pmsignal.h" +#include "storage/proc.h" #include "tcop/backend_startup.h" #include "tcop/tcopprot.h" #include "utils/datetime.h" @@ -340,6 +343,7 @@ typedef enum * ckpt */ PM_WAIT_XLOG_ARCHIVAL, /* waiting for archiver and walsenders to * finish */ + PM_WAIT_IO_WORKERS, /* waiting for io workers to exit */ PM_WAIT_CHECKPOINTER, /* waiting for checkpointer to shut down */ PM_WAIT_DEAD_END, /* waiting for dead-end children to exit */ PM_NO_CHILDREN, /* all important children have exited */ @@ -402,6 +406,10 @@ bool LoadedSSL = false; static DNSServiceRef bonjour_sdref = NULL; #endif +/* State for IO worker management. */ +static int io_worker_count = 0; +static PMChild *io_worker_children[MAX_IO_WORKERS]; + /* * postmaster.c - function prototypes */ @@ -436,6 +444,8 @@ static void TerminateChildren(int signal); static int CountChildren(BackendTypeMask targetMask); static void LaunchMissingBackgroundProcesses(void); static void maybe_start_bgworkers(void); +static bool maybe_reap_io_worker(int pid); +static void maybe_adjust_io_workers(void); static bool CreateOptsFile(int argc, char *argv[], char *fullprogname); static PMChild *StartChildProcess(BackendType type); static void StartSysLogger(void); @@ -1365,6 +1375,11 @@ PostmasterMain(int argc, char *argv[]) */ AddToDataDirLockFile(LOCK_FILE_LINE_PM_STATUS, PM_STATUS_STARTING); + UpdatePMState(PM_STARTUP); + + /* Make sure we can perform I/O while starting up. */ + maybe_adjust_io_workers(); + /* Start bgwriter and checkpointer so they can help with recovery */ if (CheckpointerPMChild == NULL) CheckpointerPMChild = StartChildProcess(B_CHECKPOINTER); @@ -1377,7 +1392,6 @@ PostmasterMain(int argc, char *argv[]) StartupPMChild = StartChildProcess(B_STARTUP); Assert(StartupPMChild != NULL); StartupStatus = STARTUP_RUNNING; - UpdatePMState(PM_STARTUP); /* Some workers may be scheduled to start now */ maybe_start_bgworkers(); @@ -2502,6 +2516,16 @@ process_pm_child_exit(void) continue; } + /* Was it an IO worker? */ + if (maybe_reap_io_worker(pid)) + { + if (!EXIT_STATUS_0(exitstatus) && !EXIT_STATUS_1(exitstatus)) + HandleChildCrash(pid, exitstatus, _("io worker")); + + maybe_adjust_io_workers(); + continue; + } + /* * Was it a backend or a background worker? */ @@ -2723,6 +2747,7 @@ HandleFatalError(QuitSignalReason reason, bool consider_sigabrt) case PM_WAIT_XLOG_SHUTDOWN: case PM_WAIT_XLOG_ARCHIVAL: case PM_WAIT_CHECKPOINTER: + case PM_WAIT_IO_WORKERS: /* * NB: Similar code exists in PostmasterStateMachine()'s handling @@ -2905,20 +2930,21 @@ PostmasterStateMachine(void) /* * If we are doing crash recovery or an immediate shutdown then we - * expect archiver, checkpointer and walsender to exit as well, - * otherwise not. + * expect archiver, checkpointer, io workers and walsender to exit as + * well, otherwise not. */ if (FatalError || Shutdown >= ImmediateShutdown) targetMask = btmask_add(targetMask, B_CHECKPOINTER, B_ARCHIVER, + B_IO_WORKER, B_WAL_SENDER); /* - * Normally walsenders and archiver will continue running; they will - * be terminated later after writing the checkpoint record. We also - * let dead-end children to keep running for now. The syslogger - * process exits last. + * Normally archiver, checkpointer, IO workers and walsenders will + * continue running; they will be terminated later after writing the + * checkpoint record. We also let dead-end children to keep running + * for now. The syslogger process exits last. * * This assertion checks that we have covered all backend types, * either by including them in targetMask, or by noting here that they @@ -2933,12 +2959,13 @@ PostmasterStateMachine(void) B_LOGGER); /* - * Archiver, checkpointer and walsender may or may not be in - * targetMask already. + * Archiver, checkpointer, IO workers, and walsender may or may + * not be in targetMask already. */ remainMask = btmask_add(remainMask, B_ARCHIVER, B_CHECKPOINTER, + B_IO_WORKER, B_WAL_SENDER); /* these are not real postmaster children */ @@ -3039,11 +3066,25 @@ PostmasterStateMachine(void) { /* * PM_WAIT_XLOG_ARCHIVAL state ends when there are no children other - * than checkpointer, dead-end children and logger left. There + * than checkpointer, io workers and dead-end children left. There * shouldn't be any regular backends left by now anyway; what we're * really waiting for is for walsenders and archiver to exit. */ - if (CountChildren(btmask_all_except(B_CHECKPOINTER, B_LOGGER, B_DEAD_END_BACKEND)) == 0) + if (CountChildren(btmask_all_except(B_CHECKPOINTER, B_IO_WORKER, + B_LOGGER, B_DEAD_END_BACKEND)) == 0) + { + UpdatePMState(PM_WAIT_IO_WORKERS); + SignalChildren(SIGUSR2, btmask(B_IO_WORKER)); + } + } + + if (pmState == PM_WAIT_IO_WORKERS) + { + /* + * PM_WAIT_IO_WORKERS state ends when there's only checkpointer and + * dead_end children left. + */ + if (io_worker_count == 0) { UpdatePMState(PM_WAIT_CHECKPOINTER); @@ -3171,10 +3212,14 @@ PostmasterStateMachine(void) /* re-create shared memory and semaphores */ CreateSharedMemoryAndSemaphores(); + UpdatePMState(PM_STARTUP); + + /* Make sure we can perform I/O while starting up. */ + maybe_adjust_io_workers(); + StartupPMChild = StartChildProcess(B_STARTUP); Assert(StartupPMChild != NULL); StartupStatus = STARTUP_RUNNING; - UpdatePMState(PM_STARTUP); /* crash recovery started, reset SIGKILL flag */ AbortStartTime = 0; @@ -3198,6 +3243,7 @@ pmstate_name(PMState state) PM_TOSTR_CASE(PM_WAIT_BACKENDS); PM_TOSTR_CASE(PM_WAIT_XLOG_SHUTDOWN); PM_TOSTR_CASE(PM_WAIT_XLOG_ARCHIVAL); + PM_TOSTR_CASE(PM_WAIT_IO_WORKERS); PM_TOSTR_CASE(PM_WAIT_DEAD_END); PM_TOSTR_CASE(PM_WAIT_CHECKPOINTER); PM_TOSTR_CASE(PM_NO_CHILDREN); @@ -3236,6 +3282,16 @@ LaunchMissingBackgroundProcesses(void) StartSysLogger(); /* + * The number of configured workers might have changed, or a prior start + * of a worker might have failed. Check if we need to start/stop any + * workers. + * + * A config file change will always lead to this function being called, so + * we always will process the config change in a timely manner. + */ + maybe_adjust_io_workers(); + + /* * The checkpointer and the background writer are active from the start, * until shutdown is initiated. * @@ -4120,6 +4176,7 @@ bgworker_should_start_now(BgWorkerStartTime start_time) case PM_WAIT_DEAD_END: case PM_WAIT_XLOG_ARCHIVAL: case PM_WAIT_XLOG_SHUTDOWN: + case PM_WAIT_IO_WORKERS: case PM_WAIT_BACKENDS: case PM_STOP_BACKENDS: break; @@ -4270,6 +4327,99 @@ maybe_start_bgworkers(void) } } +static bool +maybe_reap_io_worker(int pid) +{ + for (int id = 0; id < MAX_IO_WORKERS; ++id) + { + if (io_worker_children[id] && + io_worker_children[id]->pid == pid) + { + ReleasePostmasterChildSlot(io_worker_children[id]); + + --io_worker_count; + io_worker_children[id] = NULL; + return true; + } + } + return false; +} + +/* + * Start or stop IO workers, to close the gap between the number of running + * workers and the number of configured workers. Used to respond to change of + * the io_workers GUC (by increasing and decreasing the number of workers), as + * well as workers terminating in response to errors (by starting + * "replacement" workers). + */ +static void +maybe_adjust_io_workers(void) +{ + if (!pgaio_workers_enabled()) + return; + + /* + * If we're in final shutting down state, then we're just waiting for all + * processes to exit. + */ + if (pmState >= PM_WAIT_IO_WORKERS) + return; + + /* Don't start new workers during an immediate shutdown either. */ + if (Shutdown >= ImmediateShutdown) + return; + + /* + * Don't start new workers if we're in the shutdown phase of a crash + * restart. But we *do* need to start if we're already starting up again. + */ + if (FatalError && pmState >= PM_STOP_BACKENDS) + return; + + Assert(pmState < PM_WAIT_IO_WORKERS); + + /* Not enough running? */ + while (io_worker_count < io_workers) + { + PMChild *child; + int id; + + /* find unused entry in io_worker_children array */ + for (id = 0; id < MAX_IO_WORKERS; ++id) + { + if (io_worker_children[id] == NULL) + break; + } + if (id == MAX_IO_WORKERS) + elog(ERROR, "could not find a free IO worker ID"); + + /* Try to launch one. */ + child = StartChildProcess(B_IO_WORKER); + if (child != NULL) + { + io_worker_children[id] = child; + ++io_worker_count; + } + else + break; /* XXX try again soon? */ + } + + /* Too many running? */ + if (io_worker_count > io_workers) + { + /* ask the IO worker in the highest slot to exit */ + for (int id = MAX_IO_WORKERS - 1; id >= 0; --id) + { + if (io_worker_children[id] != NULL) + { + kill(io_worker_children[id]->pid, SIGUSR2); + break; + } + } + } +} + + /* * When a backend asks to be notified about worker state changes, we * set a flag in its backend entry. The background worker machinery needs |