method_io_uring.c
/*-------------------------------------------------------------------------
 *
 * method_io_uring.c
 *    AIO - perform AIO using Linux' io_uring
 *
 * For now we create one io_uring instance for each backend. These io_uring
 * instances have to be created in postmaster, during startup, to allow other
 * backends to process IO completions, if the issuing backend is currently
 * busy doing other things. Other backends may not use another backend's
 * io_uring instance to submit IO, as that would require additional locking
 * that would likely be harmful for performance.
 *
 * We likely will want to introduce a backend-local io_uring instance in the
 * future, e.g. for FE/BE network IO.
 *
 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/storage/aio/method_io_uring.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

/* included early, for IOMETHOD_IO_URING_ENABLED */
#include "storage/aio.h"

#ifdef IOMETHOD_IO_URING_ENABLED

#include <sys/mman.h>
#include <unistd.h>

#include <liburing.h>

#include "miscadmin.h"
#include "storage/aio_internal.h"
#include "storage/fd.h"
#include "storage/proc.h"
#include "storage/shmem.h"
#include "storage/lwlock.h"
#include "storage/procnumber.h"
#include "utils/wait_event.h"


/* number of completions processed at once */
#define PGAIO_MAX_LOCAL_COMPLETED_IO 32


/* Entry points for IoMethodOps. */
static size_t pgaio_uring_shmem_size(void);
static void pgaio_uring_shmem_init(bool first_time);
static void pgaio_uring_init_backend(void);
static int  pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios);
static void pgaio_uring_wait_one(PgAioHandle *ioh, uint64 ref_generation);

/* helper functions */
static void pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe);


const IoMethodOps pgaio_uring_ops = {
    /*
     * While io_uring mostly is OK with FDs getting closed while the IO is in
     * flight, that is not true for IOs submitted with IOSQE_ASYNC.
     *
     * See
     * https://postgr.es/m/5ons2rtmwarqqhhexb3dnqulw5rjgwgoct57vpdau4rujlrffj%403fls6d2mkiwc
     */
    .wait_on_fd_before_close = true,

    .shmem_size = pgaio_uring_shmem_size,
    .shmem_init = pgaio_uring_shmem_init,
    .init_backend = pgaio_uring_init_backend,

    .submit = pgaio_uring_submit,
    .wait_one = pgaio_uring_wait_one,
};
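
/*
 * The table above is what the AIO core dispatches through once
 * io_method=io_uring is selected; any IoMethodOps members not named here are
 * zero-initialized and therefore unused by this method.
 */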

/*
 * Per-backend state when using io_method=io_uring
 *
 * Align the whole struct to a cacheline boundary, to prevent false sharing
 * between completion_lock and prior backend's io_uring_ring.
 */
typedef struct pg_attribute_aligned (PG_CACHE_LINE_SIZE)
PgAioUringContext
{
    /*
     * Multiple backends can process completions for this backend's io_uring
     * instance (e.g. when the backend issuing IO is busy doing something
     * else). To make that safe we have to ensure that only a single backend
     * gets io completions from the io_uring instance at a time.
     */
    LWLock      completion_lock;

    struct io_uring io_uring_ring;
} PgAioUringContext;

/*
 * Information about the capabilities that io_uring has.
 *
 * Depending on liburing and kernel version different features are
 * supported. At least for the kernel a kernel version check does not suffice
 * as various vendors do backport features to older kernels :(.
 */
typedef struct PgAioUringCaps
{
    bool        checked;
    /* -1 if io_uring_queue_init_mem() is unsupported */
    int         mem_init_size;
} PgAioUringCaps;


/* PgAioUringContexts for all backends */
static PgAioUringContext *pgaio_uring_contexts;

/* the current backend's context */
static PgAioUringContext *pgaio_my_uring_context;

static PgAioUringCaps pgaio_uring_caps =
{
    .checked = false,
    .mem_init_size = -1,
};

static uint32
pgaio_uring_procs(void)
{
    /*
     * We can subtract MAX_IO_WORKERS here as io workers are never used at the
     * same time as io_method=io_uring.
     */
    return MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS;
}
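
/*
 * Note that this count determines both how many PgAioUringContext slots are
 * sized into shared memory below and how many io_uring instances (and thus
 * file descriptors) the postmaster creates at startup.
 */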

/*
 * Initializes pgaio_uring_caps, unless that's already done.
 */
static void
pgaio_uring_check_capabilities(void)
{
    if (pgaio_uring_caps.checked)
        return;

    /*
     * By default io_uring creates a shared memory mapping for each io_uring
     * instance, leading to a large number of memory mappings. Unfortunately a
     * large number of memory mappings slows things down, backend exit is
     * particularly affected. To address that, newer kernels (6.5) support
     * using user-provided memory for the ring; by putting the relevant
     * memory into shared memory we don't need any additional mappings.
     *
     * To know whether this is supported, we unfortunately need to probe the
     * kernel by trying to create a ring with userspace-provided memory. This
     * also has a secondary benefit: We can determine precisely how much
     * memory we need for each io_uring instance.
     */
#if defined(HAVE_IO_URING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
    {
        struct io_uring test_ring;
        size_t      ring_size;
        void       *ring_ptr;
        struct io_uring_params p = {0};
        int         ret;

        /*
         * Liburing does not yet provide an API to query how much memory a
         * ring will need. So we over-estimate it here. As the memory is freed
         * just below, that's a small temporary waste of memory.
         *
         * 1MB is more than enough for rings within io_max_concurrency's
         * range.
         */
        ring_size = 1024 * 1024;

        /*
         * Hard to believe a system exists where 1MB would not be a multiple
         * of the page size. But it's cheap to ensure...
         */
        ring_size -= ring_size % sysconf(_SC_PAGESIZE);

        ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        if (ring_ptr == MAP_FAILED)
            elog(ERROR,
                 "mmap(%zu) to determine io_uring_queue_init_mem() support failed: %m",
                 ring_size);

        ret = io_uring_queue_init_mem(io_max_concurrency, &test_ring, &p, ring_ptr, ring_size);
        if (ret > 0)
        {
            pgaio_uring_caps.mem_init_size = ret;

            elog(DEBUG1,
                 "can use combined memory mapping for io_uring, each ring needs %d bytes",
                 ret);

            /* clean up the created ring, it was just for a test */
            io_uring_queue_exit(&test_ring);
        }
        else
        {
            /*
             * There are different reasons for ring creation to fail, but it's
             * ok to treat that just as io_uring_queue_init_mem() not being
             * supported. We'll report a more detailed error in
             * pgaio_uring_shmem_init().
             */
            errno = -ret;
            elog(DEBUG1,
                 "cannot use combined memory mapping for io_uring, ring creation failed: %m");
        }

        if (munmap(ring_ptr, ring_size) != 0)
            elog(ERROR, "munmap() failed: %m");
    }
#else
    {
        elog(DEBUG1,
             "can't use combined memory mapping for io_uring, kernel or liburing too old");
    }
#endif

    pgaio_uring_caps.checked = true;
}
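
/*
 * The probe result above is consumed by the sizing functions below: a
 * positive mem_init_size switches the shared-memory layout over to one
 * combined region that also holds every backend's ring.
 */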

/*
 * Memory for all PgAioUringContext instances
 */
static size_t
pgaio_uring_context_shmem_size(void)
{
    return mul_size(pgaio_uring_procs(), sizeof(PgAioUringContext));
}

/*
 * Memory for the combined memory used by io_uring instances. Returns 0 if
 * that is not supported by kernel/liburing.
 */
static size_t
pgaio_uring_ring_shmem_size(void)
{
    size_t      sz = 0;

    if (pgaio_uring_caps.mem_init_size > 0)
    {
        /*
         * Memory for rings needs to be allocated to the page boundary,
         * reserve space. Luckily it does not need to be aligned to hugepage
         * boundaries, even if huge pages are used.
         */
        sz = add_size(sz, sysconf(_SC_PAGESIZE));
        sz = add_size(sz, mul_size(pgaio_uring_procs(),
                                   pgaio_uring_caps.mem_init_size));
    }

    return sz;
}

static size_t
pgaio_uring_shmem_size(void)
{
    size_t      sz;

    /*
     * Kernel and liburing support for various features influences how much
     * shmem we need, perform the necessary checks.
     */
    pgaio_uring_check_capabilities();

    sz = pgaio_uring_context_shmem_size();
    sz = add_size(sz, pgaio_uring_ring_shmem_size());

    return sz;
}
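
/*
 * Rough layout of the single "AioUringContext" allocation made in
 * pgaio_uring_shmem_init(), assuming io_uring_queue_init_mem() is usable:
 *
 *   [PgAioUringContext array, one entry per pgaio_uring_procs()]
 *   [padding up to the next page boundary]
 *   [ring memory, mem_init_size bytes per context]
 *
 * Without io_uring_queue_init_mem() support only the context array is
 * allocated here and each ring gets its own kernel-created mapping.
 */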

static void
pgaio_uring_shmem_init(bool first_time)
{
    int         TotalProcs = pgaio_uring_procs();
    bool        found;
    char       *shmem;
    size_t      ring_mem_remain = 0;
    char       *ring_mem_next = 0;

    /*
     * We allocate memory for all PgAioUringContext instances and, if
     * supported, the memory required for each of the io_uring instances, in
     * one ShmemInitStruct().
     */
    shmem = ShmemInitStruct("AioUringContext", pgaio_uring_shmem_size(), &found);
    if (found)
        return;

    pgaio_uring_contexts = (PgAioUringContext *) shmem;
    shmem += pgaio_uring_context_shmem_size();

    /* if supported, handle memory alignment / sizing for io_uring memory */
    if (pgaio_uring_caps.mem_init_size > 0)
    {
        ring_mem_remain = pgaio_uring_ring_shmem_size();
        ring_mem_next = shmem;

        /* align to page boundary, see also pgaio_uring_ring_shmem_size() */
        ring_mem_next = (char *) TYPEALIGN(sysconf(_SC_PAGESIZE), ring_mem_next);

        /* account for alignment */
        ring_mem_remain -= ring_mem_next - shmem;
        shmem += ring_mem_next - shmem;

        shmem += ring_mem_remain;
    }

    for (int contextno = 0; contextno < TotalProcs; contextno++)
    {
        PgAioUringContext *context = &pgaio_uring_contexts[contextno];
        int         ret;

        /*
         * Right now a high TotalProcs will cause problems in two ways:
         *
         * - RLIMIT_NOFILE needs to be big enough to allow all
         *   io_uring_queue_init() calls to succeed.
         *
         * - RLIMIT_NOFILE needs to be big enough to still have enough file
         *   descriptors to satisfy set_max_safe_fds() left over. Or, even
         *   better, have max_files_per_process left over FDs.
         *
         * We probably should adjust the soft RLIMIT_NOFILE to ensure that.
         *
         *
         * XXX: Newer versions of io_uring support sharing the workers that
         * execute some asynchronous IOs between io_uring instances. It might
         * be worth using that - also need to evaluate if that causes
         * noticeable additional contention?
         */

        /*
         * If supported (c.f. pgaio_uring_check_capabilities()), create the
         * ring with its data in shared memory. Otherwise fall back to
         * io_uring creating a memory mapping for each ring.
         */
#if defined(HAVE_IO_URING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
        if (pgaio_uring_caps.mem_init_size > 0)
        {
            struct io_uring_params p = {0};

            ret = io_uring_queue_init_mem(io_max_concurrency, &context->io_uring_ring, &p, ring_mem_next, ring_mem_remain);

            ring_mem_remain -= ret;
            ring_mem_next += ret;
        }
        else
#endif
        {
            ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0);
        }

        if (ret < 0)
        {
            char       *hint = NULL;
            int         err = ERRCODE_INTERNAL_ERROR;

            /* add hints for some failures that errno explains sufficiently */
            if (-ret == EPERM)
            {
                err = ERRCODE_INSUFFICIENT_PRIVILEGE;
                hint = _("Check if io_uring is disabled via /proc/sys/kernel/io_uring_disabled.");
            }
            else if (-ret == EMFILE)
            {
                err = ERRCODE_INSUFFICIENT_RESOURCES;
                hint = psprintf(_("Consider increasing \"ulimit -n\" to at least %d."),
                                TotalProcs + max_files_per_process);
            }
            else if (-ret == ENOSYS)
            {
                err = ERRCODE_FEATURE_NOT_SUPPORTED;
                hint = _("The kernel does not support io_uring.");
            }

            /* update errno to allow %m to work */
            errno = -ret;

            ereport(ERROR,
                    errcode(err),
                    errmsg("could not setup io_uring queue: %m"),
                    hint != NULL ? errhint("%s", hint) : 0);
        }

        LWLockInitialize(&context->completion_lock, LWTRANCHE_AIO_URING_COMPLETION);
    }
}

static void
pgaio_uring_init_backend(void)
{
    Assert(MyProcNumber < pgaio_uring_procs());

    pgaio_my_uring_context = &pgaio_uring_contexts[MyProcNumber];
}
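
/*
 * Note that the ring this backend will submit to was already created by the
 * postmaster in pgaio_uring_shmem_init(); here we only look up our slot, so
 * that other backends can find it via the same indexing when draining
 * completions on our behalf.
 */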

static int
pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
{
    struct io_uring *uring_instance = &pgaio_my_uring_context->io_uring_ring;
    int         in_flight_before = dclist_count(&pgaio_my_backend->in_flight_ios);

    Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);

    for (int i = 0; i < num_staged_ios; i++)
    {
        PgAioHandle *ioh = staged_ios[i];
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(uring_instance);

        if (!sqe)
            elog(ERROR, "io_uring submission queue is unexpectedly full");

        pgaio_io_prepare_submit(ioh);
        pgaio_uring_sq_from_io(ioh, sqe);

        /*
         * io_uring executes IO in process context if possible. That's
         * generally good, as it reduces context switching. When performing a
         * lot of buffered IO that means that copying between page cache and
         * userspace memory happens in the foreground, as it can't be
         * offloaded to DMA hardware as is possible when using direct IO. When
         * executing a lot of buffered IO this causes io_uring to be slower
         * than worker mode, as worker mode parallelizes the copying. io_uring
         * can be told to offload work to worker threads instead.
         *
         * If an IO is buffered IO and we already have IOs in flight or
         * multiple IOs are being submitted, we thus tell io_uring to execute
         * the IO in the background. We don't do so for the first few IOs
         * being submitted as executing in this process' context has lower
         * latency.
         */
        if (in_flight_before > 4 && (ioh->flags & PGAIO_HF_BUFFERED))
            io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);

        in_flight_before++;
    }

    while (true)
    {
        int         ret;

        pgstat_report_wait_start(WAIT_EVENT_AIO_IO_URING_SUBMIT);
        ret = io_uring_submit(uring_instance);
        pgstat_report_wait_end();

        if (ret == -EINTR)
        {
            pgaio_debug(DEBUG3,
                        "aio method uring: submit EINTR, nios: %d",
                        num_staged_ios);
        }
        else if (ret < 0)
        {
            /*
             * The io_uring_enter() manpage suggests that the appropriate
             * reaction to EAGAIN is:
             *
             * "The application should wait for some completions and try
             * again"
             *
             * However, it seems unlikely that that would help in our case, as
             * we apply a low limit to the number of outstanding IOs and thus
             * also outstanding completions, making it unlikely that we'd get
             * EAGAIN while the OS is in good working order.
             *
             * Additionally, it would be problematic to just wait here, our
             * caller might hold critical locks. It'd possibly lead to
             * delaying the crash-restart that seems likely to occur when the
             * kernel is under such heavy memory pressure.
             *
             * Update errno to allow %m to work.
             */
            errno = -ret;
            elog(PANIC, "io_uring submit failed: %m");
        }
        else if (ret != num_staged_ios)
        {
            /* likely unreachable, but if it is, we would need to re-submit */
            elog(PANIC, "io_uring submit submitted only %d of %d",
                 ret, num_staged_ios);
        }
        else
        {
            pgaio_debug(DEBUG4,
                        "aio method uring: submitted %d IOs",
                        num_staged_ios);
            break;
        }
    }

    return num_staged_ios;
}

static void
pgaio_uring_completion_error_callback(void *arg)
{
    ProcNumber  owner;
    PGPROC     *owner_proc;
    int32       owner_pid;
    PgAioHandle *ioh = arg;

    if (!ioh)
        return;

    /* No need for context if a backend is completing the IO for itself */
    if (ioh->owner_procno == MyProcNumber)
        return;

    owner = ioh->owner_procno;
    owner_proc = GetPGProcByNumber(owner);
    owner_pid = owner_proc->pid;

    errcontext("completing I/O on behalf of process %d", owner_pid);
}

static void
pgaio_uring_drain_locked(PgAioUringContext *context)
{
    int         ready;
    int         orig_ready;
    ErrorContextCallback errcallback = {0};

    Assert(LWLockHeldByMeInMode(&context->completion_lock, LW_EXCLUSIVE));

    errcallback.callback = pgaio_uring_completion_error_callback;
    errcallback.previous = error_context_stack;
    error_context_stack = &errcallback;

    /*
     * Don't drain more events than available right now. Otherwise it's
     * plausible that one backend could get stuck, for a while, receiving CQEs
     * without actually processing them.
     */
    orig_ready = ready = io_uring_cq_ready(&context->io_uring_ring);

    while (ready > 0)
    {
        struct io_uring_cqe *cqes[PGAIO_MAX_LOCAL_COMPLETED_IO];
        uint32      ncqes;

        START_CRIT_SECTION();
        ncqes =
            io_uring_peek_batch_cqe(&context->io_uring_ring,
                                    cqes,
                                    Min(PGAIO_MAX_LOCAL_COMPLETED_IO, ready));
        Assert(ncqes <= ready);

        ready -= ncqes;

        for (int i = 0; i < ncqes; i++)
        {
            struct io_uring_cqe *cqe = cqes[i];
            PgAioHandle *ioh = io_uring_cqe_get_data(cqe);
            int         result = cqe->res;

            errcallback.arg = ioh;

            io_uring_cqe_seen(&context->io_uring_ring, cqe);

            pgaio_io_process_completion(ioh, result);
            errcallback.arg = NULL;
        }

        END_CRIT_SECTION();

        pgaio_debug(DEBUG3,
                    "drained %d/%d, now expecting %d",
                    ncqes, orig_ready, io_uring_cq_ready(&context->io_uring_ring));
    }

    error_context_stack = errcallback.previous;
}
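
/*
 * The io_uring_cqe_get_data() call above relies on pgaio_uring_sq_from_io()
 * having attached the PgAioHandle to the SQE via io_uring_sqe_set_data();
 * io_uring carries that pointer through to the matching CQE unchanged.
 */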

static void
pgaio_uring_wait_one(PgAioHandle *ioh, uint64 ref_generation)
{
    PgAioHandleState state;
    ProcNumber  owner_procno = ioh->owner_procno;
    PgAioUringContext *owner_context = &pgaio_uring_contexts[owner_procno];
    bool        expect_cqe;
    int         waited = 0;

    /*
     * XXX: It would be nice to have a smarter locking scheme, nearly all the
     * time the backend owning the ring will consume the completions, making
     * the locking unnecessarily expensive.
     */
    LWLockAcquire(&owner_context->completion_lock, LW_EXCLUSIVE);

    while (true)
    {
        pgaio_debug_io(DEBUG3, ioh,
                       "wait_one io_gen: %" PRIu64 ", ref_gen: %" PRIu64 ", cycle %d",
                       ioh->generation,
                       ref_generation,
                       waited);

        if (pgaio_io_was_recycled(ioh, ref_generation, &state) ||
            state != PGAIO_HS_SUBMITTED)
        {
            /* the IO was completed by another backend */
            break;
        }
        else if (io_uring_cq_ready(&owner_context->io_uring_ring))
        {
            /* no need to wait in the kernel, io_uring has a completion */
            expect_cqe = true;
        }
        else
        {
            int         ret;
            struct io_uring_cqe *cqes;

            /* need to wait in the kernel */
            pgstat_report_wait_start(WAIT_EVENT_AIO_IO_URING_EXECUTION);
            ret = io_uring_wait_cqes(&owner_context->io_uring_ring, &cqes, 1, NULL, NULL);
            pgstat_report_wait_end();

            if (ret == -EINTR)
            {
                continue;
            }
            else if (ret != 0)
            {
                /* see comment after io_uring_submit() */
                errno = -ret;
                elog(PANIC, "io_uring wait failed: %m");
            }
            else
            {
                Assert(cqes != NULL);
                expect_cqe = true;
                waited++;
            }
        }

        if (expect_cqe)
        {
            pgaio_uring_drain_locked(owner_context);
        }
    }

    LWLockRelease(&owner_context->completion_lock);

    pgaio_debug(DEBUG3,
                "wait_one with %d sleeps",
                waited);
}
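
/*
 * Draining another backend's ring, as done above, only works because every
 * ring is created by the postmaster and reachable through the shared
 * PgAioUringContext array, with completion_lock serializing the consumers;
 * see the comments at the top of this file.
 */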

static void
pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe)
{
    struct iovec *iov;

    switch ((PgAioOp) ioh->op)
    {
        case PGAIO_OP_READV:
            iov = &pgaio_ctl->iovecs[ioh->iovec_off];
            if (ioh->op_data.read.iov_length == 1)
            {
                io_uring_prep_read(sqe,
                                   ioh->op_data.read.fd,
                                   iov->iov_base,
                                   iov->iov_len,
                                   ioh->op_data.read.offset);
            }
            else
            {
                io_uring_prep_readv(sqe,
                                    ioh->op_data.read.fd,
                                    iov,
                                    ioh->op_data.read.iov_length,
                                    ioh->op_data.read.offset);
            }
            break;

        case PGAIO_OP_WRITEV:
            iov = &pgaio_ctl->iovecs[ioh->iovec_off];
            if (ioh->op_data.write.iov_length == 1)
            {
                io_uring_prep_write(sqe,
                                    ioh->op_data.write.fd,
                                    iov->iov_base,
                                    iov->iov_len,
                                    ioh->op_data.write.offset);
            }
            else
            {
                io_uring_prep_writev(sqe,
                                     ioh->op_data.write.fd,
                                     iov,
                                     ioh->op_data.write.iov_length,
                                     ioh->op_data.write.offset);
            }
            break;

        case PGAIO_OP_INVALID:
            elog(ERROR, "trying to prepare invalid IO operation for execution");
    }

    io_uring_sqe_set_data(sqe, ioh);
}

#endif                          /* IOMETHOD_IO_URING_ENABLED */
Definition win32_port.h:361