PostgreSQL Source Code git master
Loading...
Searching...
No Matches
method_worker.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * method_worker.c
4 * AIO - perform AIO using worker processes
5 *
6 * IO workers consume IOs from a shared memory submission queue, run
7 * traditional synchronous system calls, and perform the shared completion
8 * handling immediately. Client code submits most requests by pushing IOs
9 * into the submission queue, and waits (if necessary) using condition
10 * variables. Some IOs cannot be performed in another process due to lack of
11 * infrastructure for reopening the file, and must be processed synchronously by
12 * the client code when submitted.
13 *
14 * So that the submitter can make just one system call when submitting a batch
15 * of IOs, wakeups "fan out"; each woken IO worker can wake two more. XXX This
16 * could be improved by using futexes instead of latches to wake N waiters.
17 *
18 * This method of AIO is available in all builds on all operating systems, and
19 * is the default.
20 *
21 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
22 * Portions Copyright (c) 1994, Regents of the University of California
23 *
24 * IDENTIFICATION
25 * src/backend/storage/aio/method_worker.c
26 *
27 *-------------------------------------------------------------------------
28 */
29
30#include "postgres.h"
31
32#include "libpq/pqsignal.h"
33#include "miscadmin.h"
34#include "port/pg_bitutils.h"
37#include "storage/aio.h"
39#include "storage/aio_subsys.h"
40#include "storage/io_worker.h"
41#include "storage/ipc.h"
42#include "storage/latch.h"
43#include "storage/proc.h"
44#include "tcop/tcopprot.h"
46#include "utils/memdebug.h"
47#include "utils/ps_status.h"
48#include "utils/wait_event.h"
49
50
51/* How many workers should each worker wake up if needed? */
52#define IO_WORKER_WAKEUP_FANOUT 2
53
54
62
68
74
75
76static size_t pgaio_worker_shmem_size(void);
77static void pgaio_worker_shmem_init(bool first_time);
78
80static int pgaio_worker_submit(uint16 num_staged_ios, PgAioHandle **staged_ios);
81
82
85 .shmem_init = pgaio_worker_shmem_init,
86
87 .needs_synchronous_execution = pgaio_worker_needs_synchronous_execution,
88 .submit = pgaio_worker_submit,
89};
90
91
92/* GUCs */
93int io_workers = 3;
94
95
96static int io_worker_queue_size = 64;
97static int MyIoWorkerId;
100
101
102static size_t
104{
105 /* Round size up to next power of two so we can make a mask. */
107
109 sizeof(int) * *queue_size;
110}
111
112static size_t
118
119static size_t
121{
122 size_t sz;
123 int queue_size;
124
125 sz = pgaio_worker_queue_shmem_size(&queue_size);
127
128 return sz;
129}
130
131static void
133{
134 bool found;
135 int queue_size;
136
138 ShmemInitStruct("AioWorkerSubmissionQueue",
140 &found);
141 if (!found)
142 {
143 io_worker_submission_queue->size = queue_size;
146 }
147
149 ShmemInitStruct("AioWorkerControl",
151 &found);
152 if (!found)
153 {
155 for (int i = 0; i < MAX_IO_WORKERS; ++i)
156 {
159 }
160 }
161}
162
163static int
165{
166 int worker;
167
169 return -1;
170
171 /* Find the lowest bit position, and clear it. */
173 io_worker_control->idle_worker_mask &= ~(UINT64_C(1) << worker);
175
176 return worker;
177}
178
179static bool
181{
184
186 new_head = (queue->head + 1) & (queue->size - 1);
187 if (new_head == queue->tail)
188 {
189 pgaio_debug(DEBUG3, "io queue is full, at %u elements",
191 return false; /* full */
192 }
193
194 queue->sqes[queue->head] = pgaio_io_get_id(ioh);
195 queue->head = new_head;
196
197 return true;
198}
199
200static int
202{
204 int result;
205
207 if (queue->tail == queue->head)
208 return -1; /* empty */
209
210 result = queue->sqes[queue->tail];
211 queue->tail = (queue->tail + 1) & (queue->size - 1);
212
213 return result;
214}
215
216static uint32
218{
219 uint32 head;
220 uint32 tail;
221
224
225 if (tail > head)
227
228 Assert(head >= tail);
229
230 return head - tail;
231}
232
233static bool
241
242static void
243pgaio_worker_submit_internal(int num_staged_ios, PgAioHandle **staged_ios)
244{
246 int nsync = 0;
247 Latch *wakeup = NULL;
248 int worker;
249
250 Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
251
253 {
254 for (int i = 0; i < num_staged_ios; ++i)
255 {
257 if (!pgaio_worker_submission_queue_insert(staged_ios[i]))
258 {
259 /*
260 * The queue is full, so give up on queueing and do the rest
261 * synchronously. We're holding an exclusive lock on the queue
262 * so nothing can consume entries.
263 */
264 synchronous_ios = &staged_ios[i];
265 nsync = (num_staged_ios - i);
266
267 break;
268 }
269
270 if (wakeup == NULL)
271 {
272 /* Choose an idle worker to wake up if we haven't already. */
273 worker = pgaio_worker_choose_idle();
274 if (worker >= 0)
276
277 pgaio_debug_io(DEBUG4, staged_ios[i],
278 "choosing worker %d",
279 worker);
280 }
281 }
283 }
284 else
285 {
286 /* do everything synchronously, no wakeup needed */
287 synchronous_ios = staged_ios;
288 nsync = num_staged_ios;
289 }
290
291 if (wakeup)
293
294 /* Run whatever is left synchronously. */
295 if (nsync > 0)
296 {
297 for (int i = 0; i < nsync; ++i)
298 {
300 }
301 }
302}
303
304static int
305pgaio_worker_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
306{
307 for (int i = 0; i < num_staged_ios; i++)
308 {
309 PgAioHandle *ioh = staged_ios[i];
310
312 }
313
314 pgaio_worker_submit_internal(num_staged_ios, staged_ios);
315
316 return num_staged_ios;
317}
318
319/*
320 * on_shmem_exit() callback that releases the worker's slot in
321 * io_worker_control.
322 */
323static void
335
336/*
337 * Register the worker in shared memory, assign MyIoWorkerId and register a
338 * shutdown callback to release registration.
339 */
340static void
342{
343 MyIoWorkerId = -1;
344
345 /*
346 * XXX: This could do with more fine-grained locking. But it's also not
347 * very common for the number of workers to change at the moment...
348 */
350
351 for (int i = 0; i < MAX_IO_WORKERS; ++i)
352 {
354 {
357 MyIoWorkerId = i;
358 break;
359 }
360 else
362 }
363
364 if (MyIoWorkerId == -1)
365 elog(ERROR, "couldn't find a free worker slot");
366
370
372}
373
374static void
376{
377 ProcNumber owner;
379 int32 owner_pid;
381
382 if (!ioh)
383 return;
384
385 Assert(ioh->owner_procno != MyProcNumber);
387
388 owner = ioh->owner_procno;
390 owner_pid = owner_proc->pid;
391
392 errcontext("I/O worker executing I/O on behalf of process %d", owner_pid);
393}
394
395void
397{
399 PgAioHandle *volatile error_ioh = NULL;
400 ErrorContextCallback errcallback = {0};
401 volatile int error_errno = 0;
402 char cmd[128];
403
405
407 pqsignal(SIGINT, die); /* to allow manually triggering worker restart */
408
409 /*
410 * Ignore SIGTERM, will get explicit shutdown via SIGUSR2 later in the
411 * shutdown sequence, similar to checkpointer.
412 */
414 /* SIGQUIT handler was already set up by InitPostmasterChild */
419
420 /* also registers a shutdown callback to unregister */
422
423 sprintf(cmd, "%d", MyIoWorkerId);
424 set_ps_display(cmd);
425
427 errcallback.previous = error_context_stack;
428 error_context_stack = &errcallback;
429
430 /* see PostgresMain() */
431 if (sigsetjmp(local_sigjmp_buf, 1) != 0)
432 {
435
437
438 /*
439 * In the - very unlikely - case that the IO failed in a way that
440 * raises an error we need to mark the IO as failed.
441 *
442 * Need to do just enough error recovery so that we can mark the IO as
443 * failed and then exit (postmaster will start a new worker).
444 */
446
447 if (error_ioh != NULL)
448 {
449 /* should never fail without setting error_errno */
450 Assert(error_errno != 0);
451
453
457 }
458
459 proc_exit(1);
460 }
461
462 /* We can now handle ereport(ERROR) */
464
466
468 {
471 int nlatches = 0;
472 int nwakeups = 0;
473 int worker;
474
475 /*
476 * Try to get a job to do.
477 *
478 * The lwlock acquisition also provides the necessary memory barrier
479 * to ensure that we don't see outdated data in the handle.
480 */
483 {
484 /*
485 * Nothing to do. Mark self idle.
486 *
487 * XXX: Invent some kind of back pressure to reduce useless
488 * wakeups?
489 */
491 }
492 else
493 {
494 /* Got one. Clear idle flag. */
496
497 /* See if we can wake up some peers. */
500 for (int i = 0; i < nwakeups; ++i)
501 {
502 if ((worker = pgaio_worker_choose_idle()) < 0)
503 break;
505 }
506 }
508
509 for (int i = 0; i < nlatches; ++i)
511
512 if (io_index != -1)
513 {
515
517 error_ioh = ioh;
518 errcallback.arg = ioh;
519
521 "worker %d processing IO",
523
524 /*
525 * Prevent interrupts between pgaio_io_reopen() and
526 * pgaio_io_perform_synchronously() that otherwise could lead to
527 * the FD getting closed in that window.
528 */
530
531 /*
532 * It's very unlikely, but possible, that reopen fails. E.g. due
533 * to memory allocations failing or file permissions changing or
534 * such. In that case we need to fail the IO.
535 *
536 * There's not really a good errno we can report here.
537 */
540
541 /*
542 * To be able to exercise the reopen-fails path, allow injection
543 * points to trigger a failure at this point.
544 */
545 INJECTION_POINT("aio-worker-after-reopen", ioh);
546
547 error_errno = 0;
548 error_ioh = NULL;
549
550 /*
551 * As part of IO completion the buffer will be marked as NOACCESS,
552 * until the buffer is pinned again - which never happens in io
553 * workers. Therefore the next time there is IO for the same
554 * buffer, the memory will be considered inaccessible. To avoid
555 * that, explicitly allow access to the memory before reading data
556 * into it.
557 */
558#ifdef USE_VALGRIND
559 {
560 struct iovec *iov;
561 uint16 iov_length = pgaio_io_get_iovec_length(ioh, &iov);
562
563 for (int i = 0; i < iov_length; i++)
565 }
566#endif
567
568 /*
569 * We don't expect this to ever fail with ERROR or FATAL, no need
570 * to keep error_ioh set to the IO.
571 * pgaio_io_perform_synchronously() contains a critical section to
572 * ensure we don't accidentally fail.
573 */
575
577 errcallback.arg = NULL;
578 }
579 else
580 {
584 }
585
587
589 {
590 ConfigReloadPending = false;
592 }
593 }
594
595 error_context_stack = errcallback.previous;
596 proc_exit(0);
597}
598
599bool
601{
602 return io_method == IOMETHOD_WORKER;
603}
void pgaio_io_process_completion(PgAioHandle *ioh, int result)
Definition aio.c:528
int io_method
Definition aio.c:74
int pgaio_io_get_id(PgAioHandle *ioh)
Definition aio.c:342
PgAioCtl * pgaio_ctl
Definition aio.c:78
void pgaio_io_prepare_submit(PgAioHandle *ioh)
Definition aio.c:510
@ IOMETHOD_WORKER
Definition aio.h:35
@ PGAIO_HF_REFERENCES_LOCAL
Definition aio.h:60
#define pgaio_debug(elevel, msg,...)
#define pgaio_debug_io(elevel, ioh, msg,...)
#define PGAIO_SUBMIT_BATCH_SIZE
void pgaio_io_perform_synchronously(PgAioHandle *ioh)
Definition aio_io.c:116
int pgaio_io_get_iovec_length(PgAioHandle *ioh, struct iovec **iov)
Definition aio_io.c:219
void pgaio_io_reopen(PgAioHandle *ioh)
Definition aio_target.c:116
bool pgaio_io_can_reopen(PgAioHandle *ioh)
Definition aio_target.c:103
void AuxiliaryProcessMainCommon(void)
Definition auxprocess.c:40
sigset_t UnBlockSig
Definition pqsignal.c:22
#define Min(x, y)
Definition c.h:1093
#define Assert(condition)
Definition c.h:945
#define FLEXIBLE_ARRAY_MEMBER
Definition c.h:552
int32_t int32
Definition c.h:614
uint64_t uint64
Definition c.h:619
uint16_t uint16
Definition c.h:617
uint32_t uint32
Definition c.h:618
Datum arg
Definition elog.c:1322
void EmitErrorReport(void)
Definition elog.c:1882
ErrorContextCallback * error_context_stack
Definition elog.c:99
sigjmp_buf * PG_exception_stack
Definition elog.c:101
#define errcontext
Definition elog.h:198
#define DEBUG3
Definition elog.h:28
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define DEBUG4
Definition elog.h:27
ProcNumber MyProcNumber
Definition globals.c:90
bool IsUnderPostmaster
Definition globals.c:120
struct Latch * MyLatch
Definition globals.c:63
void ProcessConfigFile(GucContext context)
Definition guc-file.l:120
@ PGC_SIGHUP
Definition guc.h:75
#define INJECTION_POINT(name, arg)
void SignalHandlerForShutdownRequest(SIGNAL_ARGS)
Definition interrupt.c:104
volatile sig_atomic_t ShutdownRequestPending
Definition interrupt.c:28
volatile sig_atomic_t ConfigReloadPending
Definition interrupt.c:27
void SignalHandlerForConfigReload(SIGNAL_ARGS)
Definition interrupt.c:61
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition ipc.c:372
void proc_exit(int code)
Definition ipc.c:105
int i
Definition isn.c:77
void SetLatch(Latch *latch)
Definition latch.c:290
void ResetLatch(Latch *latch)
Definition latch.c:374
int WaitLatch(Latch *latch, int wakeEvents, long timeout, uint32 wait_event_info)
Definition latch.c:172
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1177
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1794
void LWLockReleaseAll(void)
Definition lwlock.c:1893
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1348
@ LW_EXCLUSIVE
Definition lwlock.h:112
#define VALGRIND_MAKE_MEM_UNDEFINED(addr, size)
Definition memdebug.h:28
static size_t pgaio_worker_control_shmem_size(void)
static uint32 pgaio_worker_submission_queue_depth(void)
static void pgaio_worker_error_callback(void *arg)
static bool pgaio_worker_needs_synchronous_execution(PgAioHandle *ioh)
static int pgaio_worker_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
#define IO_WORKER_WAKEUP_FANOUT
static size_t pgaio_worker_shmem_size(void)
static size_t pgaio_worker_queue_shmem_size(int *queue_size)
static int io_worker_queue_size
static void pgaio_worker_register(void)
static PgAioWorkerControl * io_worker_control
static int MyIoWorkerId
const IoMethodOps pgaio_worker_ops
static void pgaio_worker_die(int code, Datum arg)
static int pgaio_worker_submission_queue_consume(void)
static bool pgaio_worker_submission_queue_insert(PgAioHandle *ioh)
bool pgaio_workers_enabled(void)
static PgAioWorkerSubmissionQueue * io_worker_submission_queue
void IoWorkerMain(const void *startup_data, size_t startup_data_len)
static void pgaio_worker_submit_internal(int num_staged_ios, PgAioHandle **staged_ios)
static void pgaio_worker_shmem_init(bool first_time)
int io_workers
static int pgaio_worker_choose_idle(void)
#define RESUME_INTERRUPTS()
Definition miscadmin.h:136
#define START_CRIT_SECTION()
Definition miscadmin.h:150
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
#define HOLD_INTERRUPTS()
Definition miscadmin.h:134
@ B_IO_WORKER
Definition miscadmin.h:364
#define END_CRIT_SECTION()
Definition miscadmin.h:152
BackendType MyBackendType
Definition miscinit.c:65
static int pg_rightmost_one_pos64(uint64 word)
static uint32 pg_nextpower2_32(uint32 num)
#define die(msg)
#define pqsignal
Definition port.h:547
#define sprintf
Definition port.h:262
uint64_t Datum
Definition postgres.h:70
static int fb(int x)
#define MAX_IO_WORKERS
Definition proc.h:523
#define GetPGProcByNumber(n)
Definition proc.h:501
int ProcNumber
Definition procnumber.h:24
void procsignal_sigusr1_handler(SIGNAL_ARGS)
Definition procsignal.c:680
static void set_ps_display(const char *activity)
Definition ps_status.h:40
Size add_size(Size s1, Size s2)
Definition shmem.c:485
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition shmem.c:381
struct ErrorContextCallback * previous
Definition elog.h:297
void(* callback)(void *arg)
Definition elog.h:298
size_t(* shmem_size)(void)
Definition latch.h:116
Definition proc.h:176
PgAioHandle * io_handles
PgAioWorkerSlot workers[FLEXIBLE_ARRAY_MEMBER]
int sqes[FLEXIBLE_ARRAY_MEMBER]
#define WL_EXIT_ON_PM_DEATH
#define WL_LATCH_SET
static TimestampTz wakeup[NUM_WALRCV_WAKEUPS]
#define SIGHUP
Definition win32_port.h:158
#define SIGPIPE
Definition win32_port.h:163
#define SIGUSR1
Definition win32_port.h:170
#define SIGALRM
Definition win32_port.h:164
#define SIGUSR2
Definition win32_port.h:171