PostgreSQL Source Code git master
Loading...
Searching...
No Matches
method_io_uring.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * method_io_uring.c
4 * AIO - perform AIO using Linux' io_uring
5 *
6 * For now we create one io_uring instance for each backend. These io_uring
7 * instances have to be created in postmaster, during startup, to allow other
8 * backends to process IO completions, if the issuing backend is currently
9 * busy doing other things. Other backends may not use another backend's
10 * io_uring instance to submit IO, that'd require additional locking that
11 * would likely be harmful for performance.
12 *
13 * We likely will want to introduce a backend-local io_uring instance in the
14 * future, e.g. for FE/BE network IO.
15 *
16 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
17 * Portions Copyright (c) 1994, Regents of the University of California
18 *
19 * IDENTIFICATION
20 * src/backend/storage/aio/method_io_uring.c
21 *
22 *-------------------------------------------------------------------------
23 */
24
25#include "postgres.h"
26
27/* included early, for IOMETHOD_IO_URING_ENABLED */
28#include "storage/aio.h"
29
30#ifdef IOMETHOD_IO_URING_ENABLED
31
32#include <sys/mman.h>
33#include <unistd.h>
34
35#include <liburing.h>
36
#include "miscadmin.h"
#include "storage/aio_internal.h"
#include "storage/fd.h"
#include "storage/lwlock.h"
#include "storage/proc.h"
#include "storage/procnumber.h"
#include "storage/shmem.h"
#include "utils/wait_event.h"
45
46
47/* number of completions processed at once */
48#define PGAIO_MAX_LOCAL_COMPLETED_IO 32
49
50
51/* Entry points for IoMethodOps. */
52static size_t pgaio_uring_shmem_size(void);
53static void pgaio_uring_shmem_init(bool first_time);
54static void pgaio_uring_init_backend(void);
55static int pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios);
58
59/* helper functions */
61
62
64 /*
65 * While io_uring mostly is OK with FDs getting closed while the IO is in
66 * flight, that is not true for IOs submitted with IOSQE_ASYNC.
67 *
68 * See
69 * https://postgr.es/m/5ons2rtmwarqqhhexb3dnqulw5rjgwgoct57vpdau4rujlrffj%403fls6d2mkiwc
70 */
72
73 .shmem_size = pgaio_uring_shmem_size,
74 .shmem_init = pgaio_uring_shmem_init,
75 .init_backend = pgaio_uring_init_backend,
76
77 .submit = pgaio_uring_submit,
78 .wait_one = pgaio_uring_wait_one,
79 .check_one = pgaio_uring_check_one,
80};
81
82/*
83 * Per-backend state when using io_method=io_uring
84 */
85typedef struct PgAioUringContext
86{
87 /*
88 * Align the whole struct to a cacheline boundary, to prevent false
89 * sharing between completion_lock and prior backend's io_uring_ring.
90 */
91 alignas(PG_CACHE_LINE_SIZE)
92
93 /*
94 * Multiple backends can process completions for this backend's io_uring
95 * instance (e.g. when the backend issuing IO is busy doing something
96 * else). To make that safe we have to ensure that only a single backend
97 * gets io completions from the io_uring instance at a time.
98 */
100
101 struct io_uring io_uring_ring;
103
104/*
105 * Information about the capabilities that io_uring has.
106 *
107 * Depending on liburing and kernel version different features are
108 * supported. At least for the kernel a kernel version check does not suffice
109 * as various vendors do backport features to older kernels :(.
110 */
111typedef struct PgAioUringCaps
112{
113 bool checked;
114 /* -1 if io_uring_queue_init_mem() is unsupported */
115 int mem_init_size;
117
118
119/* PgAioUringContexts for all backends */
121
122/* the current backend's context */
124
126{
127 .checked = false,
128 .mem_init_size = -1,
129};
130
131static uint32
133{
134 /*
135 * We can subtract MAX_IO_WORKERS here as io workers are never used at the
136 * same time as io_method=io_uring.
137 */
139}
140
141/*
142 * Initializes pgaio_uring_caps, unless that's already done.
143 */
144static void
146{
147 if (pgaio_uring_caps.checked)
148 return;
149
150 /*
151 * By default io_uring creates a shared memory mapping for each io_uring
152 * instance, leading to a large number of memory mappings. Unfortunately a
153 * large number of memory mappings slows things down, backend exit is
154 * particularly affected. To address that, newer kernels (6.5) support
155 * using user-provided memory for the memory, by putting the relevant
156 * memory into shared memory we don't need any additional mappings.
157 *
158 * To know whether this is supported, we unfortunately need to probe the
159 * kernel by trying to create a ring with userspace-provided memory. This
160 * also has a secondary benefit: We can determine precisely how much
161 * memory we need for each io_uring instance.
162 */
163#if defined(HAVE_IO_URING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
164 {
165 struct io_uring test_ring;
166 size_t ring_size;
167 void *ring_ptr;
168 struct io_uring_params p = {0};
169 int ret;
170
171 /*
172 * Liburing does not yet provide an API to query how much memory a
173 * ring will need. So we over-estimate it here. As the memory is freed
174 * just below that's small temporary waste of memory.
175 *
176 * 1MB is more than enough for rings within io_max_concurrency's
177 * range.
178 */
179 ring_size = 1024 * 1024;
180
181 /*
182 * Hard to believe a system exists where 1MB would not be a multiple
183 * of the page size. But it's cheap to ensure...
184 */
186
188 if (ring_ptr == MAP_FAILED)
189 elog(ERROR,
190 "mmap(%zu) to determine io_uring_queue_init_mem() support failed: %m",
191 ring_size);
192
194 if (ret > 0)
195 {
196 pgaio_uring_caps.mem_init_size = ret;
197
198 elog(DEBUG1,
199 "can use combined memory mapping for io_uring, each ring needs %d bytes",
200 ret);
201
202 /* clean up the created ring, it was just for a test */
204 }
205 else
206 {
207 /*
208 * There are different reasons for ring creation to fail, but it's
209 * ok to treat that just as io_uring_queue_init_mem() not being
210 * supported. We'll report a more detailed error in
211 * pgaio_uring_shmem_init().
212 */
213 errno = -ret;
214 elog(DEBUG1,
215 "cannot use combined memory mapping for io_uring, ring creation failed: %m");
216
217 }
218
219 if (munmap(ring_ptr, ring_size) != 0)
220 elog(ERROR, "munmap() failed: %m");
221 }
222#else
223 {
224 elog(DEBUG1,
225 "can't use combined memory mapping for io_uring, kernel or liburing too old");
226 }
227#endif
228
229 pgaio_uring_caps.checked = true;
230}
231
232/*
233 * Memory for all PgAioUringContext instances
234 */
235static size_t
237{
239}
240
241/*
242 * Memory for the combined memory used by io_uring instances. Returns 0 if
243 * that is not supported by kernel/liburing.
244 */
245static size_t
247{
248 size_t sz = 0;
249
250 if (pgaio_uring_caps.mem_init_size > 0)
251 {
252 /*
253 * Memory for rings needs to be allocated to the page boundary,
254 * reserve space. Luckily it does not need to be aligned to hugepage
255 * boundaries, even if huge pages are used.
256 */
259 pgaio_uring_caps.mem_init_size));
260 }
261
262 return sz;
263}
264
265static size_t
267{
268 size_t sz;
269
270 /*
271 * Kernel and liburing support for various features influences how much
272 * shmem we need, perform the necessary checks.
273 */
275
278
279 return sz;
280}
281
282static void
284{
286 bool found;
287 char *shmem;
288 size_t ring_mem_remain = 0;
289 char *ring_mem_next = 0;
290
291 /*
292 * We allocate memory for all PgAioUringContext instances and, if
293 * supported, the memory required for each of the io_uring instances, in
294 * one ShmemInitStruct().
295 */
296 shmem = ShmemInitStruct("AioUringContext", pgaio_uring_shmem_size(), &found);
297 if (found)
298 return;
299
302
303 /* if supported, handle memory alignment / sizing for io_uring memory */
304 if (pgaio_uring_caps.mem_init_size > 0)
305 {
307 ring_mem_next = shmem;
308
309 /* align to page boundary, see also pgaio_uring_ring_shmem_size() */
311
312 /* account for alignment */
314 shmem += ring_mem_next - shmem;
315
316 shmem += ring_mem_remain;
317 }
318
319 for (int contextno = 0; contextno < TotalProcs; contextno++)
320 {
322 int ret;
323
324 /*
325 * Right now a high TotalProcs will cause problems in two ways:
326 *
327 * - RLIMIT_NOFILE needs to be big enough to allow all
328 * io_uring_queue_init() calls to succeed.
329 *
330 * - RLIMIT_NOFILE needs to be big enough to still have enough file
331 * descriptors to satisfy set_max_safe_fds() left over. Or, even
332 * better, have max_files_per_process left over FDs.
333 *
334 * We probably should adjust the soft RLIMIT_NOFILE to ensure that.
335 *
336 *
337 * XXX: Newer versions of io_uring support sharing the workers that
338 * execute some asynchronous IOs between io_uring instances. It might
339 * be worth using that - also need to evaluate if that causes
340 * noticeable additional contention?
341 */
342
343 /*
344 * If supported (c.f. pgaio_uring_check_capabilities()), create ring
345 * with its data in shared memory. Otherwise fall back io_uring
346 * creating a memory mapping for each ring.
347 */
348#if defined(HAVE_IO_URING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
349 if (pgaio_uring_caps.mem_init_size > 0)
350 {
351 struct io_uring_params p = {0};
352
353 ret = io_uring_queue_init_mem(io_max_concurrency, &context->io_uring_ring, &p, ring_mem_next, ring_mem_remain);
354
355 ring_mem_remain -= ret;
356 ring_mem_next += ret;
357 }
358 else
359#endif
360 {
361 ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0);
362 }
363
364 if (ret < 0)
365 {
366 char *hint = NULL;
368
369 /* add hints for some failures that errno explains sufficiently */
370 if (-ret == EPERM)
371 {
373 hint = _("Check if io_uring is disabled via /proc/sys/kernel/io_uring_disabled.");
374 }
375 else if (-ret == EMFILE)
376 {
378 hint = psprintf(_("Consider increasing \"ulimit -n\" to at least %d."),
380 }
381 else if (-ret == ENOSYS)
382 {
384 hint = _("The kernel does not support io_uring.");
385 }
386
387 /* update errno to allow %m to work */
388 errno = -ret;
389
391 errcode(err),
392 errmsg("could not setup io_uring queue: %m"),
393 hint != NULL ? errhint("%s", hint) : 0);
394 }
395
396 LWLockInitialize(&context->completion_lock, LWTRANCHE_AIO_URING_COMPLETION);
397 }
398}
399
400static void
402{
404
406}
407
408static int
409pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
410{
411 struct io_uring *uring_instance = &pgaio_my_uring_context->io_uring_ring;
413
414 Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
415
416 for (int i = 0; i < num_staged_ios; i++)
417 {
418 PgAioHandle *ioh = staged_ios[i];
419 struct io_uring_sqe *sqe;
420
422
423 if (!sqe)
424 elog(ERROR, "io_uring submission queue is unexpectedly full");
425
428
429 /*
430 * io_uring executes IO in process context if possible. That's
431 * generally good, as it reduces context switching. When performing a
432 * lot of buffered IO that means that copying between page cache and
433 * userspace memory happens in the foreground, as it can't be
434 * offloaded to DMA hardware as is possible when using direct IO. When
435 * executing a lot of buffered IO this causes io_uring to be slower
436 * than worker mode, as worker mode parallelizes the copying. io_uring
437 * can be told to offload work to worker threads instead.
438 *
439 * If an IO is buffered IO and we already have IOs in flight or
440 * multiple IOs are being submitted, we thus tell io_uring to execute
441 * the IO in the background. We don't do so for the first few IOs
442 * being submitted as executing in this process' context has lower
443 * latency.
444 */
445 if (in_flight_before > 4 && (ioh->flags & PGAIO_HF_BUFFERED))
447
449 }
450
451 while (true)
452 {
453 int ret;
454
458
459 if (ret == -EINTR)
460 {
462 "aio method uring: submit EINTR, nios: %d",
463 num_staged_ios);
464 }
465 else if (ret < 0)
466 {
467 /*
468 * The io_uring_enter() manpage suggests that the appropriate
469 * reaction to EAGAIN is:
470 *
471 * "The application should wait for some completions and try
472 * again"
473 *
474 * However, it seems unlikely that that would help in our case, as
475 * we apply a low limit to the number of outstanding IOs and thus
476 * also outstanding completions, making it unlikely that we'd get
477 * EAGAIN while the OS is in good working order.
478 *
479 * Additionally, it would be problematic to just wait here, our
480 * caller might hold critical locks. It'd possibly lead to
481 * delaying the crash-restart that seems likely to occur when the
482 * kernel is under such heavy memory pressure.
483 *
484 * Update errno to allow %m to work.
485 */
486 errno = -ret;
487 elog(PANIC, "io_uring submit failed: %m");
488 }
489 else if (ret != num_staged_ios)
490 {
491 /* likely unreachable, but if it is, we would need to re-submit */
492 elog(PANIC, "io_uring submit submitted only %d of %d",
493 ret, num_staged_ios);
494 }
495 else
496 {
498 "aio method uring: submitted %d IOs",
499 num_staged_ios);
500 break;
501 }
502 }
503
504 return num_staged_ios;
505}
506
507static void
509{
510 ProcNumber owner;
512 int32 owner_pid;
514
515 if (!ioh)
516 return;
517
518 /* No need for context if a backend is completing the IO for itself */
519 if (ioh->owner_procno == MyProcNumber)
520 return;
521
522 owner = ioh->owner_procno;
524 owner_pid = owner_proc->pid;
525
526 errcontext("completing I/O on behalf of process %d", owner_pid);
527}
528
529static void
531{
532 int ready;
533 int orig_ready;
534 ErrorContextCallback errcallback = {0};
535
536 Assert(LWLockHeldByMeInMode(&context->completion_lock, LW_EXCLUSIVE));
537
539 errcallback.previous = error_context_stack;
540 error_context_stack = &errcallback;
541
542 /*
543 * Don't drain more events than available right now. Otherwise it's
544 * plausible that one backend could get stuck, for a while, receiving CQEs
545 * without actually processing them.
546 */
547 orig_ready = ready = io_uring_cq_ready(&context->io_uring_ring);
548
549 while (ready > 0)
550 {
553
555 ncqes =
556 io_uring_peek_batch_cqe(&context->io_uring_ring,
557 cqes,
559 Assert(ncqes <= ready);
560
561 ready -= ncqes;
562
563 for (int i = 0; i < ncqes; i++)
564 {
565 struct io_uring_cqe *cqe = cqes[i];
567 int result = cqe->res;
568
569 errcallback.arg = ioh;
570
571 io_uring_cqe_seen(&context->io_uring_ring, cqe);
572
574 errcallback.arg = NULL;
575 }
576
578
580 "drained %d/%d, now expecting %d",
581 ncqes, orig_ready, io_uring_cq_ready(&context->io_uring_ring));
582 }
583
584 error_context_stack = errcallback.previous;
585}
586
587static void
589{
591 ProcNumber owner_procno = ioh->owner_procno;
593 bool expect_cqe;
594 int waited = 0;
595
596 /*
597 * XXX: It would be nice to have a smarter locking scheme, nearly all the
598 * time the backend owning the ring will consume the completions, making
599 * the locking unnecessarily expensive.
600 */
601 LWLockAcquire(&owner_context->completion_lock, LW_EXCLUSIVE);
602
603 while (true)
604 {
606 "wait_one io_gen: %" PRIu64 ", ref_gen: %" PRIu64 ", cycle %d",
607 ioh->generation,
609 waited);
610
613 {
614 /* the IO was completed by another backend */
615 break;
616 }
617 else if (io_uring_cq_ready(&owner_context->io_uring_ring))
618 {
619 /* no need to wait in the kernel, io_uring has a completion */
620 expect_cqe = true;
621 }
622 else
623 {
624 int ret;
625 struct io_uring_cqe *cqes;
626
627 /* need to wait in the kernel */
629 ret = io_uring_wait_cqes(&owner_context->io_uring_ring, &cqes, 1, NULL, NULL);
631
632 if (ret == -EINTR)
633 {
634 continue;
635 }
636 else if (ret != 0)
637 {
638 /* see comment after io_uring_submit() */
639 errno = -ret;
640 elog(PANIC, "io_uring wait failed: %m");
641 }
642 else
643 {
644 Assert(cqes != NULL);
645 expect_cqe = true;
646 waited++;
647 }
648 }
649
650 if (expect_cqe)
651 {
653 }
654 }
655
656 LWLockRelease(&owner_context->completion_lock);
657
659 "wait_one with %d sleeps",
660 waited);
661}
662
663static void
665{
666 ProcNumber owner_procno = ioh->owner_procno;
668
669 /*
670 * This check is not reliable when not holding the completion lock, but
671 * it's a useful cheap pre-check to see if it's worth trying to get the
672 * completion lock.
673 */
674 if (!io_uring_cq_ready(&owner_context->io_uring_ring))
675 return;
676
677 /*
678 * If the completion lock is currently held, the holder will likely
679 * process any pending completions, give up.
680 */
681 if (!LWLockConditionalAcquire(&owner_context->completion_lock, LW_EXCLUSIVE))
682 return;
683
685 "check_one io_gen: %" PRIu64 ", ref_gen: %" PRIu64,
686 ioh->generation,
688
689 /*
690 * Recheck if there are any completions, another backend could have
691 * processed them since we checked above, or our unlocked pre-check could
692 * have been reading outdated values.
693 *
694 * It is possible that the IO handle has been reused since the start of
695 * the call, but now that we have the lock, we can just as well drain all
696 * completions.
697 */
698 if (io_uring_cq_ready(&owner_context->io_uring_ring))
700
701 LWLockRelease(&owner_context->completion_lock);
702}
703
704static void
706{
707 struct iovec *iov;
708
709 switch ((PgAioOp) ioh->op)
710 {
711 case PGAIO_OP_READV:
712 iov = &pgaio_ctl->iovecs[ioh->iovec_off];
713 if (ioh->op_data.read.iov_length == 1)
714 {
716 ioh->op_data.read.fd,
717 iov->iov_base,
718 iov->iov_len,
719 ioh->op_data.read.offset);
720 }
721 else
722 {
724 ioh->op_data.read.fd,
725 iov,
726 ioh->op_data.read.iov_length,
727 ioh->op_data.read.offset);
728
729 }
730 break;
731
732 case PGAIO_OP_WRITEV:
733 iov = &pgaio_ctl->iovecs[ioh->iovec_off];
734 if (ioh->op_data.write.iov_length == 1)
735 {
737 ioh->op_data.write.fd,
738 iov->iov_base,
739 iov->iov_len,
740 ioh->op_data.write.offset);
741 }
742 else
743 {
745 ioh->op_data.write.fd,
746 iov,
747 ioh->op_data.write.iov_length,
748 ioh->op_data.write.offset);
749 }
750 break;
751
752 case PGAIO_OP_INVALID:
753 elog(ERROR, "trying to prepare invalid IO operation for execution");
754 }
755
757}
758
759#endif /* IOMETHOD_IO_URING_ENABLED */
void pgaio_io_process_completion(PgAioHandle *ioh, int result)
Definition aio.c:528
PgAioBackend * pgaio_my_backend
Definition aio.c:81
int io_max_concurrency
Definition aio.c:75
PgAioCtl * pgaio_ctl
Definition aio.c:78
bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
Definition aio.c:559
void pgaio_io_prepare_submit(PgAioHandle *ioh)
Definition aio.c:510
PgAioOp
Definition aio.h:88
@ PGAIO_OP_WRITEV
Definition aio.h:93
@ PGAIO_OP_INVALID
Definition aio.h:90
@ PGAIO_OP_READV
Definition aio.h:92
@ PGAIO_HF_BUFFERED
Definition aio.h:77
PgAioHandleState
@ PGAIO_HS_SUBMITTED
#define pgaio_debug(elevel, msg,...)
#define pgaio_debug_io(elevel, ioh, msg,...)
#define PGAIO_SUBMIT_BATCH_SIZE
#define Min(x, y)
Definition c.h:1091
#define TYPEALIGN(ALIGNVAL, LEN)
Definition c.h:889
#define Assert(condition)
Definition c.h:943
int32_t int32
Definition c.h:620
uint64_t uint64
Definition c.h:625
uint16_t uint16
Definition c.h:623
uint32_t uint32
Definition c.h:624
Datum arg
Definition elog.c:1322
ErrorContextCallback * error_context_stack
Definition elog.c:99
int errcode(int sqlerrcode)
Definition elog.c:874
#define _(x)
Definition elog.c:95
#define errcontext
Definition elog.h:199
int errhint(const char *fmt,...) pg_attribute_printf(1
#define DEBUG3
Definition elog.h:28
#define PANIC
Definition elog.h:43
#define DEBUG1
Definition elog.h:30
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:227
#define ereport(elevel,...)
Definition elog.h:151
#define DEBUG4
Definition elog.h:27
void err(int eval, const char *fmt,...)
Definition err.c:43
int max_files_per_process
Definition fd.c:147
ProcNumber MyProcNumber
Definition globals.c:90
int MaxBackends
Definition globals.c:146
static uint32 dclist_count(const dclist_head *head)
Definition ilist.h:932
int i
Definition isn.c:77
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1149
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1928
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1766
void LWLockInitialize(LWLock *lock, int tranche_id)
Definition lwlock.c:669
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1320
@ LW_EXCLUSIVE
Definition lwlock.h:104
#define MAP_FAILED
Definition mem.h:43
#define MAP_ANONYMOUS
Definition mem.h:25
#define START_CRIT_SECTION()
Definition miscadmin.h:150
#define END_CRIT_SECTION()
Definition miscadmin.h:152
static char * errmsg
#define PG_CACHE_LINE_SIZE
static int fb(int x)
#define MAX_IO_WORKERS
Definition proc.h:525
#define NUM_AUXILIARY_PROCS
Definition proc.h:526
#define GetPGProcByNumber(n)
Definition proc.h:503
int ProcNumber
Definition procnumber.h:24
char * psprintf(const char *fmt,...)
Definition psprintf.c:43
Size add_size(Size s1, Size s2)
Definition shmem.c:483
Size mul_size(Size s1, Size s2)
Definition shmem.c:498
void * ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Definition shmem.c:408
struct ErrorContextCallback * previous
Definition elog.h:298
void(* callback)(void *arg)
Definition elog.h:299
bool wait_on_fd_before_close
Definition proc.h:178
dclist_head in_flight_ios
struct iovec * iovecs
int32 owner_procno
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:69
static void pgstat_report_wait_end(void)
Definition wait_event.h:85
#define EINTR
Definition win32_port.h:361